xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision dad64f0e)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191 
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197 
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 	SLIST_ENTRY(hn_txdesc)		link;
201 #endif
202 	STAILQ_ENTRY(hn_txdesc)		agg_link;
203 
204 	/* Aggregated txdescs, in sending order. */
205 	STAILQ_HEAD(, hn_txdesc)	agg_list;
206 
207 	/* The oldest packet, if transmission aggregation happens. */
208 	struct mbuf			*m;
209 	struct hn_tx_ring		*txr;
210 	int				refs;
211 	uint32_t			flags;	/* HN_TXD_FLAG_ */
212 	struct hn_nvs_sendctx		send_ctx;
213 	uint32_t			chim_index;
214 	int				chim_size;
215 
216 	bus_dmamap_t			data_dmap;
217 
218 	bus_addr_t			rndis_pkt_paddr;
219 	struct rndis_packet_msg		*rndis_pkt;
220 	bus_dmamap_t			rndis_pkt_dmap;
221 };
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
231 struct packet_info_id {
232 	uint8_t				ver;
233 	uint8_t				flag;
234 	uint16_t			pkt_id;
235 };
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
240 struct hn_rxinfo {
241 	const uint32_t			*vlan_info;
242 	const uint32_t			*csum_info;
243 	const uint32_t			*hash_info;
244 	const uint32_t			*hash_value;
245 	const struct packet_info_id	*pktinfo_id;
246 };
247 
248 struct hn_rxvf_setarg {
249 	struct hn_rx_ring	*rxr;
250 	struct ifnet		*vf_ifp;
251 };
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(struct ifnet *);
276 #endif
277 static int			hn_transmit(struct ifnet *, struct mbuf *);
278 static void			hn_xmit_qflush(struct ifnet *);
279 static int			hn_ifmedia_upd(struct ifnet *);
280 static void			hn_ifmedia_sts(struct ifnet *,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, struct ifnet *, int);
284 static void			hn_ifaddr_event(void *, struct ifnet *);
285 static void			hn_ifnet_attevent(void *, struct ifnet *);
286 static void			hn_ifnet_detevent(void *, struct ifnet *);
287 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const struct ifnet *);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    struct ifnet *, bool);
293 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
340 #ifndef RSS
341 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
343 #endif
344 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
358 
359 static void			hn_stop(struct hn_softc *, bool);
360 static void			hn_init_locked(struct hn_softc *);
361 static int			hn_chan_attach(struct hn_softc *,
362 				    struct vmbus_channel *);
363 static void			hn_chan_detach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static int			hn_attach_subchans(struct hn_softc *);
366 static void			hn_detach_allchans(struct hn_softc *);
367 static void			hn_chan_rollup(struct hn_rx_ring *,
368 				    struct hn_tx_ring *);
369 static void			hn_set_ring_inuse(struct hn_softc *, int);
370 static int			hn_synth_attach(struct hn_softc *, int);
371 static void			hn_synth_detach(struct hn_softc *);
372 static int			hn_synth_alloc_subchans(struct hn_softc *,
373 				    int *);
374 static bool			hn_synth_attachable(const struct hn_softc *);
375 static void			hn_suspend(struct hn_softc *);
376 static void			hn_suspend_data(struct hn_softc *);
377 static void			hn_suspend_mgmt(struct hn_softc *);
378 static void			hn_resume(struct hn_softc *);
379 static void			hn_resume_data(struct hn_softc *);
380 static void			hn_resume_mgmt(struct hn_softc *);
381 static void			hn_suspend_mgmt_taskfunc(void *, int);
382 static void			hn_chan_drain(struct hn_softc *,
383 				    struct vmbus_channel *);
384 static void			hn_disable_rx(struct hn_softc *);
385 static void			hn_drain_rxtx(struct hn_softc *, int);
386 static void			hn_polling(struct hn_softc *, u_int);
387 static void			hn_chan_polling(struct vmbus_channel *, u_int);
388 static void			hn_mtu_change_fixup(struct hn_softc *);
389 
390 static void			hn_update_link_status(struct hn_softc *);
391 static void			hn_change_network(struct hn_softc *);
392 static void			hn_link_taskfunc(void *, int);
393 static void			hn_netchg_init_taskfunc(void *, int);
394 static void			hn_netchg_status_taskfunc(void *, int);
395 static void			hn_link_status(struct hn_softc *);
396 
397 static int			hn_create_rx_data(struct hn_softc *, int);
398 static void			hn_destroy_rx_data(struct hn_softc *);
399 static int			hn_check_iplen(const struct mbuf *, int);
400 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
401 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
402 static int			hn_rxfilter_config(struct hn_softc *);
403 static int			hn_rss_reconfig(struct hn_softc *);
404 static void			hn_rss_ind_fixup(struct hn_softc *);
405 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
406 static int			hn_rxpkt(struct hn_rx_ring *);
407 static uint32_t			hn_rss_type_fromndis(uint32_t);
408 static uint32_t			hn_rss_type_tondis(uint32_t);
409 
410 static int			hn_tx_ring_create(struct hn_softc *, int);
411 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
412 static int			hn_create_tx_data(struct hn_softc *, int);
413 static void			hn_fixup_tx_data(struct hn_softc *);
414 static void			hn_fixup_rx_data(struct hn_softc *);
415 static void			hn_destroy_tx_data(struct hn_softc *);
416 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
417 static void			hn_txdesc_gc(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
420 				    struct hn_txdesc *, struct mbuf **);
421 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
422 				    struct hn_txdesc *);
423 static void			hn_set_chim_size(struct hn_softc *, int);
424 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
425 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
426 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
427 static void			hn_resume_tx(struct hn_softc *, int);
428 static void			hn_set_txagg(struct hn_softc *);
429 static void			*hn_try_txagg(struct ifnet *,
430 				    struct hn_tx_ring *, struct hn_txdesc *,
431 				    int);
432 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
433 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
434 				    struct hn_softc *, struct vmbus_channel *,
435 				    const void *, int);
436 static int			hn_txpkt_sglist(struct hn_tx_ring *,
437 				    struct hn_txdesc *);
438 static int			hn_txpkt_chim(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_xmit(struct hn_tx_ring *, int);
441 static void			hn_xmit_taskfunc(void *, int);
442 static void			hn_xmit_txeof(struct hn_tx_ring *);
443 static void			hn_xmit_txeof_taskfunc(void *, int);
444 #ifdef HN_IFSTART_SUPPORT
445 static int			hn_start_locked(struct hn_tx_ring *, int);
446 static void			hn_start_taskfunc(void *, int);
447 static void			hn_start_txeof(struct hn_tx_ring *);
448 static void			hn_start_txeof_taskfunc(void *, int);
449 #endif
450 
451 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
452 
453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
454     "Hyper-V network interface");
455 
456 /* Trust tcp segment verification on host side. */
457 static int			hn_trust_hosttcp = 1;
458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
459     &hn_trust_hosttcp, 0,
460     "Trust tcp segment verification on host side, "
461     "when csum info is missing (global setting)");
462 
463 /* Trust udp datagrams verification on host side. */
464 static int			hn_trust_hostudp = 1;
465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
466     &hn_trust_hostudp, 0,
467     "Trust udp datagram verification on host side, "
468     "when csum info is missing (global setting)");
469 
470 /* Trust ip packets verification on host side. */
471 static int			hn_trust_hostip = 1;
472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
473     &hn_trust_hostip, 0,
474     "Trust ip packet verification on host side, "
475     "when csum info is missing (global setting)");
476 
477 /*
478  * Offload UDP/IPv4 checksum.
479  */
480 static int			hn_enable_udp4cs = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
482     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
483 
484 /*
485  * Offload UDP/IPv6 checksum.
486  */
487 static int			hn_enable_udp6cs = 1;
488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
489     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
490 
491 /* Stats. */
492 static counter_u64_t		hn_udpcs_fixup;
493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
494     &hn_udpcs_fixup, "# of UDP checksum fixup");
495 
496 /*
497  * See hn_set_hlen().
498  *
499  * This value is for Azure.  For Hyper-V, set this above
500  * 65536 to disable UDP datagram checksum fixup.
501  */
502 static int			hn_udpcs_fixup_mtu = 1420;
503 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
504     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
505 
506 /* Limit TSO burst size */
507 static int			hn_tso_maxlen = IP_MAXPACKET;
508 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
509     &hn_tso_maxlen, 0, "TSO burst limit");
510 
511 /* Limit chimney send size */
512 static int			hn_tx_chimney_size = 0;
513 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
514     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
515 
516 /* Limit the size of packet for direct transmission */
517 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
518 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
519     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
520 
521 /* # of LRO entries per RX ring */
522 #if defined(INET) || defined(INET6)
523 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
525     &hn_lro_entry_count, 0, "LRO entry count");
526 #endif
527 
528 static int			hn_tx_taskq_cnt = 1;
529 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
530     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
531 
532 #define HN_TX_TASKQ_M_INDEP	0
533 #define HN_TX_TASKQ_M_GLOBAL	1
534 #define HN_TX_TASKQ_M_EVTTQ	2
535 
536 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
538     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
539     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
540 
541 #ifndef HN_USE_TXDESC_BUFRING
542 static int			hn_use_txdesc_bufring = 0;
543 #else
544 static int			hn_use_txdesc_bufring = 1;
545 #endif
546 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
547     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
548 
549 #ifdef HN_IFSTART_SUPPORT
550 /* Use ifnet.if_start instead of ifnet.if_transmit */
551 static int			hn_use_if_start = 0;
552 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
553     &hn_use_if_start, 0, "Use if_start TX method");
554 #endif
555 
556 /* # of channels to use */
557 static int			hn_chan_cnt = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
559     &hn_chan_cnt, 0,
560     "# of channels to use; each channel has one RX ring and one TX ring");
561 
562 /* # of transmit rings to use */
563 static int			hn_tx_ring_cnt = 0;
564 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
565     &hn_tx_ring_cnt, 0, "# of TX rings to use");
566 
567 /* Software TX ring deptch */
568 static int			hn_tx_swq_depth = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
570     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
571 
572 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
573 static u_int			hn_lro_mbufq_depth = 0;
574 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
575     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
576 
577 /* Packet transmission aggregation size limit */
578 static int			hn_tx_agg_size = -1;
579 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
580     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
581 
582 /* Packet transmission aggregation count limit */
583 static int			hn_tx_agg_pkts = -1;
584 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
585     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
586 
587 /* VF list */
588 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
589     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
590     hn_vflist_sysctl, "A",
591     "VF list");
592 
593 /* VF mapping */
594 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
595     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
596     hn_vfmap_sysctl, "A",
597     "VF mapping");
598 
599 /* Transparent VF */
600 static int			hn_xpnt_vf = 1;
601 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
602     &hn_xpnt_vf, 0, "Transparent VF mod");
603 
604 /* Accurate BPF support for Transparent VF */
605 static int			hn_xpnt_vf_accbpf = 0;
606 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
607     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
608 
609 /* Extra wait for transparent VF attach routing; unit seconds. */
610 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
612     &hn_xpnt_vf_attwait, 0,
613     "Extra wait for transparent VF attach routing; unit: seconds");
614 
615 static u_int			hn_cpu_index;	/* next CPU for channel */
616 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
617 
618 static struct rmlock		hn_vfmap_lock;
619 static int			hn_vfmap_size;
620 static struct ifnet		**hn_vfmap;
621 
622 #ifndef RSS
623 static const uint8_t
624 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
625 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
626 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
627 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
628 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
629 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
630 };
631 #endif	/* !RSS */
632 
633 static const struct hyperv_guid	hn_guid = {
634 	.hv_guid = {
635 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
636 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
637 };
638 
639 static device_method_t hn_methods[] = {
640 	/* Device interface */
641 	DEVMETHOD(device_probe,		hn_probe),
642 	DEVMETHOD(device_attach,	hn_attach),
643 	DEVMETHOD(device_detach,	hn_detach),
644 	DEVMETHOD(device_shutdown,	hn_shutdown),
645 	DEVMETHOD_END
646 };
647 
648 static driver_t hn_driver = {
649 	"hn",
650 	hn_methods,
651 	sizeof(struct hn_softc)
652 };
653 
654 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
655 MODULE_VERSION(hn, 1);
656 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
657 
658 static void
659 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
660 {
661 	int i;
662 
663 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
664 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
665 }
666 
667 static int
668 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
669 {
670 
671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
673 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
674 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
675 }
676 
677 static int
678 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
679 {
680 	struct hn_nvs_rndis rndis;
681 
682 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size > 0, ("invalid rndis chim txd"));
684 
685 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
686 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
687 	rndis.nvs_chim_idx = txd->chim_index;
688 	rndis.nvs_chim_sz = txd->chim_size;
689 
690 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
691 	    &rndis, sizeof(rndis), &txd->send_ctx));
692 }
693 
694 static __inline uint32_t
695 hn_chim_alloc(struct hn_softc *sc)
696 {
697 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
698 	u_long *bmap = sc->hn_chim_bmap;
699 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
700 
701 	for (i = 0; i < bmap_cnt; ++i) {
702 		int idx;
703 
704 		idx = ffsl(~bmap[i]);
705 		if (idx == 0)
706 			continue;
707 
708 		--idx; /* ffsl is 1-based */
709 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
710 		    ("invalid i %d and idx %d", i, idx));
711 
712 		if (atomic_testandset_long(&bmap[i], idx))
713 			continue;
714 
715 		ret = i * LONG_BIT + idx;
716 		break;
717 	}
718 	return (ret);
719 }
720 
721 static __inline void
722 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
723 {
724 	u_long mask;
725 	uint32_t idx;
726 
727 	idx = chim_idx / LONG_BIT;
728 	KASSERT(idx < sc->hn_chim_bmap_cnt,
729 	    ("invalid chimney index 0x%x", chim_idx));
730 
731 	mask = 1UL << (chim_idx % LONG_BIT);
732 	KASSERT(sc->hn_chim_bmap[idx] & mask,
733 	    ("index bitmap 0x%lx, chimney index %u, "
734 	     "bitmap idx %d, bitmask 0x%lx",
735 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
736 
737 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
738 }
739 
740 #if defined(INET6) || defined(INET)
741 
742 #define PULLUP_HDR(m, len)				\
743 do {							\
744 	if (__predict_false((m)->m_len < (len))) {	\
745 		(m) = m_pullup((m), (len));		\
746 		if ((m) == NULL)			\
747 			return (NULL);			\
748 	}						\
749 } while (0)
750 
751 /*
752  * NOTE: If this function failed, the m_head would be freed.
753  */
754 static __inline struct mbuf *
755 hn_tso_fixup(struct mbuf *m_head)
756 {
757 	struct ether_vlan_header *evl;
758 	struct tcphdr *th;
759 	int ehlen;
760 
761 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
762 
763 	PULLUP_HDR(m_head, sizeof(*evl));
764 	evl = mtod(m_head, struct ether_vlan_header *);
765 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
766 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
767 	else
768 		ehlen = ETHER_HDR_LEN;
769 	m_head->m_pkthdr.l2hlen = ehlen;
770 
771 #ifdef INET
772 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
773 		struct ip *ip;
774 		int iphlen;
775 
776 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
777 		ip = mtodo(m_head, ehlen);
778 		iphlen = ip->ip_hl << 2;
779 		m_head->m_pkthdr.l3hlen = iphlen;
780 
781 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
782 		th = mtodo(m_head, ehlen + iphlen);
783 
784 		ip->ip_len = 0;
785 		ip->ip_sum = 0;
786 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
787 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
788 	}
789 #endif
790 #if defined(INET6) && defined(INET)
791 	else
792 #endif
793 #ifdef INET6
794 	{
795 		struct ip6_hdr *ip6;
796 
797 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
798 		ip6 = mtodo(m_head, ehlen);
799 		if (ip6->ip6_nxt != IPPROTO_TCP) {
800 			m_freem(m_head);
801 			return (NULL);
802 		}
803 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
804 
805 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
806 		th = mtodo(m_head, ehlen + sizeof(*ip6));
807 
808 		ip6->ip6_plen = 0;
809 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
810 	}
811 #endif
812 	return (m_head);
813 }
814 
815 /*
816  * NOTE: If this function failed, the m_head would be freed.
817  */
818 static __inline struct mbuf *
819 hn_set_hlen(struct mbuf *m_head)
820 {
821 	const struct ether_vlan_header *evl;
822 	int ehlen;
823 
824 	PULLUP_HDR(m_head, sizeof(*evl));
825 	evl = mtod(m_head, const struct ether_vlan_header *);
826 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
827 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
828 	else
829 		ehlen = ETHER_HDR_LEN;
830 	m_head->m_pkthdr.l2hlen = ehlen;
831 
832 #ifdef INET
833 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
834 		const struct ip *ip;
835 		int iphlen;
836 
837 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
838 		ip = mtodo(m_head, ehlen);
839 		iphlen = ip->ip_hl << 2;
840 		m_head->m_pkthdr.l3hlen = iphlen;
841 
842 		/*
843 		 * UDP checksum offload does not work in Azure, if the
844 		 * following conditions meet:
845 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
846 		 * - IP_DF is not set in the IP hdr.
847 		 *
848 		 * Fallback to software checksum for these UDP datagrams.
849 		 */
850 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
851 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
852 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
853 			uint16_t off = ehlen + iphlen;
854 
855 			counter_u64_add(hn_udpcs_fixup, 1);
856 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
857 			*(uint16_t *)(m_head->m_data + off +
858                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
859 			    m_head, m_head->m_pkthdr.len, off);
860 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
861 		}
862 	}
863 #endif
864 #if defined(INET6) && defined(INET)
865 	else
866 #endif
867 #ifdef INET6
868 	{
869 		const struct ip6_hdr *ip6;
870 
871 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
872 		ip6 = mtodo(m_head, ehlen);
873 		if (ip6->ip6_nxt != IPPROTO_TCP &&
874 		    ip6->ip6_nxt != IPPROTO_UDP) {
875 			m_freem(m_head);
876 			return (NULL);
877 		}
878 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
879 	}
880 #endif
881 	return (m_head);
882 }
883 
884 /*
885  * NOTE: If this function failed, the m_head would be freed.
886  */
887 static __inline struct mbuf *
888 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
889 {
890 	const struct tcphdr *th;
891 	int ehlen, iphlen;
892 
893 	*tcpsyn = 0;
894 	ehlen = m_head->m_pkthdr.l2hlen;
895 	iphlen = m_head->m_pkthdr.l3hlen;
896 
897 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
898 	th = mtodo(m_head, ehlen + iphlen);
899 	if (th->th_flags & TH_SYN)
900 		*tcpsyn = 1;
901 	return (m_head);
902 }
903 
904 #undef PULLUP_HDR
905 
906 #endif	/* INET6 || INET */
907 
908 static int
909 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
910 {
911 	int error = 0;
912 
913 	HN_LOCK_ASSERT(sc);
914 
915 	if (sc->hn_rx_filter != filter) {
916 		error = hn_rndis_set_rxfilter(sc, filter);
917 		if (!error)
918 			sc->hn_rx_filter = filter;
919 	}
920 	return (error);
921 }
922 
923 static int
924 hn_rxfilter_config(struct hn_softc *sc)
925 {
926 	struct ifnet *ifp = sc->hn_ifp;
927 	uint32_t filter;
928 
929 	HN_LOCK_ASSERT(sc);
930 
931 	/*
932 	 * If the non-transparent mode VF is activated, we don't know how
933 	 * its RX filter is configured, so stick the synthetic device in
934 	 * the promiscous mode.
935 	 */
936 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
937 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
938 	} else {
939 		filter = NDIS_PACKET_TYPE_DIRECTED;
940 		if (ifp->if_flags & IFF_BROADCAST)
941 			filter |= NDIS_PACKET_TYPE_BROADCAST;
942 		/* TODO: support multicast list */
943 		if ((ifp->if_flags & IFF_ALLMULTI) ||
944 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
945 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
946 	}
947 	return (hn_set_rxfilter(sc, filter));
948 }
949 
950 static void
951 hn_set_txagg(struct hn_softc *sc)
952 {
953 	uint32_t size, pkts;
954 	int i;
955 
956 	/*
957 	 * Setup aggregation size.
958 	 */
959 	if (sc->hn_agg_size < 0)
960 		size = UINT32_MAX;
961 	else
962 		size = sc->hn_agg_size;
963 
964 	if (sc->hn_rndis_agg_size < size)
965 		size = sc->hn_rndis_agg_size;
966 
967 	/* NOTE: We only aggregate packets using chimney sending buffers. */
968 	if (size > (uint32_t)sc->hn_chim_szmax)
969 		size = sc->hn_chim_szmax;
970 
971 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
972 		/* Disable */
973 		size = 0;
974 		pkts = 0;
975 		goto done;
976 	}
977 
978 	/* NOTE: Type of the per TX ring setting is 'int'. */
979 	if (size > INT_MAX)
980 		size = INT_MAX;
981 
982 	/*
983 	 * Setup aggregation packet count.
984 	 */
985 	if (sc->hn_agg_pkts < 0)
986 		pkts = UINT32_MAX;
987 	else
988 		pkts = sc->hn_agg_pkts;
989 
990 	if (sc->hn_rndis_agg_pkts < pkts)
991 		pkts = sc->hn_rndis_agg_pkts;
992 
993 	if (pkts <= 1) {
994 		/* Disable */
995 		size = 0;
996 		pkts = 0;
997 		goto done;
998 	}
999 
1000 	/* NOTE: Type of the per TX ring setting is 'short'. */
1001 	if (pkts > SHRT_MAX)
1002 		pkts = SHRT_MAX;
1003 
1004 done:
1005 	/* NOTE: Type of the per TX ring setting is 'short'. */
1006 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1007 		/* Disable */
1008 		size = 0;
1009 		pkts = 0;
1010 	}
1011 
1012 	if (bootverbose) {
1013 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1014 		    size, pkts, sc->hn_rndis_agg_align);
1015 	}
1016 
1017 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1018 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1019 
1020 		mtx_lock(&txr->hn_tx_lock);
1021 		txr->hn_agg_szmax = size;
1022 		txr->hn_agg_pktmax = pkts;
1023 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1024 		mtx_unlock(&txr->hn_tx_lock);
1025 	}
1026 }
1027 
1028 static int
1029 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1030 {
1031 
1032 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1033 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1034 		return txr->hn_txdesc_cnt;
1035 	return hn_tx_swq_depth;
1036 }
1037 
1038 static int
1039 hn_rss_reconfig(struct hn_softc *sc)
1040 {
1041 	int error;
1042 
1043 	HN_LOCK_ASSERT(sc);
1044 
1045 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1046 		return (ENXIO);
1047 
1048 	/*
1049 	 * Disable RSS first.
1050 	 *
1051 	 * NOTE:
1052 	 * Direct reconfiguration by setting the UNCHG flags does
1053 	 * _not_ work properly.
1054 	 */
1055 	if (bootverbose)
1056 		if_printf(sc->hn_ifp, "disable RSS\n");
1057 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1058 	if (error) {
1059 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1060 		return (error);
1061 	}
1062 
1063 	/*
1064 	 * Reenable the RSS w/ the updated RSS key or indirect
1065 	 * table.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1072 		return (error);
1073 	}
1074 	return (0);
1075 }
1076 
1077 static void
1078 hn_rss_ind_fixup(struct hn_softc *sc)
1079 {
1080 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1081 	int i, nchan;
1082 
1083 	nchan = sc->hn_rx_ring_inuse;
1084 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1085 
1086 	/*
1087 	 * Check indirect table to make sure that all channels in it
1088 	 * can be used.
1089 	 */
1090 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1091 		if (rss->rss_ind[i] >= nchan) {
1092 			if_printf(sc->hn_ifp,
1093 			    "RSS indirect table %d fixup: %u -> %d\n",
1094 			    i, rss->rss_ind[i], nchan - 1);
1095 			rss->rss_ind[i] = nchan - 1;
1096 		}
1097 	}
1098 }
1099 
1100 static int
1101 hn_ifmedia_upd(struct ifnet *ifp __unused)
1102 {
1103 
1104 	return EOPNOTSUPP;
1105 }
1106 
1107 static void
1108 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1109 {
1110 	struct hn_softc *sc = ifp->if_softc;
1111 
1112 	ifmr->ifm_status = IFM_AVALID;
1113 	ifmr->ifm_active = IFM_ETHER;
1114 
1115 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1116 		ifmr->ifm_active |= IFM_NONE;
1117 		return;
1118 	}
1119 	ifmr->ifm_status |= IFM_ACTIVE;
1120 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1121 }
1122 
1123 static void
1124 hn_rxvf_set_task(void *xarg, int pending __unused)
1125 {
1126 	struct hn_rxvf_setarg *arg = xarg;
1127 
1128 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1129 }
1130 
1131 static void
1132 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1133 {
1134 	struct hn_rx_ring *rxr;
1135 	struct hn_rxvf_setarg arg;
1136 	struct task task;
1137 	int i;
1138 
1139 	HN_LOCK_ASSERT(sc);
1140 
1141 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1142 
1143 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1144 		rxr = &sc->hn_rx_ring[i];
1145 
1146 		if (i < sc->hn_rx_ring_inuse) {
1147 			arg.rxr = rxr;
1148 			arg.vf_ifp = vf_ifp;
1149 			vmbus_chan_run_task(rxr->hn_chan, &task);
1150 		} else {
1151 			rxr->hn_rxvf_ifp = vf_ifp;
1152 		}
1153 	}
1154 }
1155 
1156 static bool
1157 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1158 {
1159 	const struct ifnet *hn_ifp;
1160 
1161 	hn_ifp = sc->hn_ifp;
1162 
1163 	if (ifp == hn_ifp)
1164 		return (false);
1165 
1166 	if (ifp->if_alloctype != IFT_ETHER)
1167 		return (false);
1168 
1169 	/* Ignore lagg/vlan interfaces */
1170 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1171 	    strcmp(ifp->if_dname, "vlan") == 0)
1172 		return (false);
1173 
1174 	/*
1175 	 * During detach events ifp->if_addr might be NULL.
1176 	 * Make sure the bcmp() below doesn't panic on that:
1177 	 */
1178 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1179 		return (false);
1180 
1181 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1182 		return (false);
1183 
1184 	return (true);
1185 }
1186 
1187 static void
1188 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1189 {
1190 	struct ifnet *hn_ifp;
1191 
1192 	HN_LOCK(sc);
1193 
1194 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1195 		goto out;
1196 
1197 	if (!hn_ismyvf(sc, ifp))
1198 		goto out;
1199 	hn_ifp = sc->hn_ifp;
1200 
1201 	if (rxvf) {
1202 		if (sc->hn_flags & HN_FLAG_RXVF)
1203 			goto out;
1204 
1205 		sc->hn_flags |= HN_FLAG_RXVF;
1206 		hn_rxfilter_config(sc);
1207 	} else {
1208 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1209 			goto out;
1210 
1211 		sc->hn_flags &= ~HN_FLAG_RXVF;
1212 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1213 			hn_rxfilter_config(sc);
1214 		else
1215 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1216 	}
1217 
1218 	hn_nvs_set_datapath(sc,
1219 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1220 
1221 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1222 
1223 	if (rxvf) {
1224 		hn_vf_rss_fixup(sc, true);
1225 		hn_suspend_mgmt(sc);
1226 		sc->hn_link_flags &=
1227 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1228 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1229 	} else {
1230 		hn_vf_rss_restore(sc);
1231 		hn_resume_mgmt(sc);
1232 	}
1233 
1234 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1235 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1236 
1237 	if (bootverbose) {
1238 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1239 		    rxvf ? "to" : "from", ifp->if_xname);
1240 	}
1241 out:
1242 	HN_UNLOCK(sc);
1243 }
1244 
1245 static void
1246 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1247 {
1248 
1249 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1250 		return;
1251 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1252 }
1253 
1254 static void
1255 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1256 {
1257 
1258 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1259 }
1260 
1261 static int
1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1263 {
1264 	struct ifnet *ifp, *vf_ifp;
1265 	uint64_t tmp;
1266 	int error;
1267 
1268 	HN_LOCK_ASSERT(sc);
1269 	ifp = sc->hn_ifp;
1270 	vf_ifp = sc->hn_vf_ifp;
1271 
1272 	/*
1273 	 * Fix up requested capabilities w/ supported capabilities,
1274 	 * since the supported capabilities could have been changed.
1275 	 */
1276 	ifr->ifr_reqcap &= ifp->if_capabilities;
1277 	/* Pass SIOCSIFCAP to VF. */
1278 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1279 
1280 	/*
1281 	 * NOTE:
1282 	 * The error will be propagated to the callers, however, it
1283 	 * is _not_ useful here.
1284 	 */
1285 
1286 	/*
1287 	 * Merge VF's enabled capabilities.
1288 	 */
1289 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1290 
1291 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1292 	if (ifp->if_capenable & IFCAP_TXCSUM)
1293 		ifp->if_hwassist |= tmp;
1294 	else
1295 		ifp->if_hwassist &= ~tmp;
1296 
1297 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1298 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1299 		ifp->if_hwassist |= tmp;
1300 	else
1301 		ifp->if_hwassist &= ~tmp;
1302 
1303 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1304 	if (ifp->if_capenable & IFCAP_TSO4)
1305 		ifp->if_hwassist |= tmp;
1306 	else
1307 		ifp->if_hwassist &= ~tmp;
1308 
1309 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1310 	if (ifp->if_capenable & IFCAP_TSO6)
1311 		ifp->if_hwassist |= tmp;
1312 	else
1313 		ifp->if_hwassist &= ~tmp;
1314 
1315 	return (error);
1316 }
1317 
1318 static int
1319 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1320 {
1321 	struct ifnet *vf_ifp;
1322 	struct ifreq ifr;
1323 
1324 	HN_LOCK_ASSERT(sc);
1325 	vf_ifp = sc->hn_vf_ifp;
1326 
1327 	memset(&ifr, 0, sizeof(ifr));
1328 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1329 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1330 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1331 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1332 }
1333 
1334 static void
1335 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1336 {
1337 	struct ifnet *ifp = sc->hn_ifp;
1338 	int allmulti = 0;
1339 
1340 	HN_LOCK_ASSERT(sc);
1341 
1342 	/* XXX vlan(4) style mcast addr maintenance */
1343 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1344 		allmulti = IFF_ALLMULTI;
1345 
1346 	/* Always set the VF's if_flags */
1347 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1348 }
1349 
1350 static void
1351 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1352 {
1353 	struct rm_priotracker pt;
1354 	struct ifnet *hn_ifp = NULL;
1355 	struct mbuf *mn;
1356 
1357 	/*
1358 	 * XXX racy, if hn(4) ever detached.
1359 	 */
1360 	rm_rlock(&hn_vfmap_lock, &pt);
1361 	if (vf_ifp->if_index < hn_vfmap_size)
1362 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1363 	rm_runlock(&hn_vfmap_lock, &pt);
1364 
1365 	if (hn_ifp != NULL) {
1366 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1367 			/*
1368 			 * Allow tapping on the VF.
1369 			 */
1370 			ETHER_BPF_MTAP(vf_ifp, mn);
1371 
1372 			/*
1373 			 * Update VF stats.
1374 			 */
1375 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1376 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1377 				    mn->m_pkthdr.len);
1378 			}
1379 			/*
1380 			 * XXX IFCOUNTER_IMCAST
1381 			 * This stat updating is kinda invasive, since it
1382 			 * requires two checks on the mbuf: the length check
1383 			 * and the ethernet header check.  As of this write,
1384 			 * all multicast packets go directly to hn(4), which
1385 			 * makes imcast stat updating in the VF a try in vian.
1386 			 */
1387 
1388 			/*
1389 			 * Fix up rcvif and increase hn(4)'s ipackets.
1390 			 */
1391 			mn->m_pkthdr.rcvif = hn_ifp;
1392 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1393 		}
1394 		/*
1395 		 * Go through hn(4)'s if_input.
1396 		 */
1397 		hn_ifp->if_input(hn_ifp, m);
1398 	} else {
1399 		/*
1400 		 * In the middle of the transition; free this
1401 		 * mbuf chain.
1402 		 */
1403 		while (m != NULL) {
1404 			mn = m->m_nextpkt;
1405 			m->m_nextpkt = NULL;
1406 			m_freem(m);
1407 			m = mn;
1408 		}
1409 	}
1410 }
1411 
1412 static void
1413 hn_mtu_change_fixup(struct hn_softc *sc)
1414 {
1415 	struct ifnet *ifp;
1416 
1417 	HN_LOCK_ASSERT(sc);
1418 	ifp = sc->hn_ifp;
1419 
1420 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1421 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1422 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1423 }
1424 
1425 static uint32_t
1426 hn_rss_type_fromndis(uint32_t rss_hash)
1427 {
1428 	uint32_t types = 0;
1429 
1430 	if (rss_hash & NDIS_HASH_IPV4)
1431 		types |= RSS_TYPE_IPV4;
1432 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1433 		types |= RSS_TYPE_TCP_IPV4;
1434 	if (rss_hash & NDIS_HASH_IPV6)
1435 		types |= RSS_TYPE_IPV6;
1436 	if (rss_hash & NDIS_HASH_IPV6_EX)
1437 		types |= RSS_TYPE_IPV6_EX;
1438 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1439 		types |= RSS_TYPE_TCP_IPV6;
1440 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1441 		types |= RSS_TYPE_TCP_IPV6_EX;
1442 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1443 		types |= RSS_TYPE_UDP_IPV4;
1444 	return (types);
1445 }
1446 
1447 static uint32_t
1448 hn_rss_type_tondis(uint32_t types)
1449 {
1450 	uint32_t rss_hash = 0;
1451 
1452 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1453 	    ("UDP6 and UDP6EX are not supported"));
1454 
1455 	if (types & RSS_TYPE_IPV4)
1456 		rss_hash |= NDIS_HASH_IPV4;
1457 	if (types & RSS_TYPE_TCP_IPV4)
1458 		rss_hash |= NDIS_HASH_TCP_IPV4;
1459 	if (types & RSS_TYPE_IPV6)
1460 		rss_hash |= NDIS_HASH_IPV6;
1461 	if (types & RSS_TYPE_IPV6_EX)
1462 		rss_hash |= NDIS_HASH_IPV6_EX;
1463 	if (types & RSS_TYPE_TCP_IPV6)
1464 		rss_hash |= NDIS_HASH_TCP_IPV6;
1465 	if (types & RSS_TYPE_TCP_IPV6_EX)
1466 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1467 	if (types & RSS_TYPE_UDP_IPV4)
1468 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1469 	return (rss_hash);
1470 }
1471 
1472 static void
1473 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1474 {
1475 	int i;
1476 
1477 	HN_LOCK_ASSERT(sc);
1478 
1479 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1480 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1481 }
1482 
1483 static void
1484 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1485 {
1486 	struct ifnet *ifp, *vf_ifp;
1487 	struct ifrsshash ifrh;
1488 	struct ifrsskey ifrk;
1489 	int error;
1490 	uint32_t my_types, diff_types, mbuf_types = 0;
1491 
1492 	HN_LOCK_ASSERT(sc);
1493 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1494 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1495 
1496 	if (sc->hn_rx_ring_inuse == 1) {
1497 		/* No RSS on synthetic parts; done. */
1498 		return;
1499 	}
1500 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1501 		/* Synthetic parts do not support Toeplitz; done. */
1502 		return;
1503 	}
1504 
1505 	ifp = sc->hn_ifp;
1506 	vf_ifp = sc->hn_vf_ifp;
1507 
1508 	/*
1509 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1510 	 * supported.
1511 	 */
1512 	memset(&ifrk, 0, sizeof(ifrk));
1513 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1514 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1515 	if (error) {
1516 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1517 		    vf_ifp->if_xname, error);
1518 		goto done;
1519 	}
1520 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1521 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1522 		    vf_ifp->if_xname, ifrk.ifrk_func);
1523 		goto done;
1524 	}
1525 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1526 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1527 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1528 		goto done;
1529 	}
1530 
1531 	/*
1532 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1533 	 */
1534 	memset(&ifrh, 0, sizeof(ifrh));
1535 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1536 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1537 	if (error) {
1538 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1539 		    vf_ifp->if_xname, error);
1540 		goto done;
1541 	}
1542 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1543 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1544 		    vf_ifp->if_xname, ifrh.ifrh_func);
1545 		goto done;
1546 	}
1547 
1548 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1549 	if ((ifrh.ifrh_types & my_types) == 0) {
1550 		/* This disables RSS; ignore it then */
1551 		if_printf(ifp, "%s intersection of RSS types failed.  "
1552 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1553 		    ifrh.ifrh_types, my_types);
1554 		goto done;
1555 	}
1556 
1557 	diff_types = my_types ^ ifrh.ifrh_types;
1558 	my_types &= ifrh.ifrh_types;
1559 	mbuf_types = my_types;
1560 
1561 	/*
1562 	 * Detect RSS hash value/type confliction.
1563 	 *
1564 	 * NOTE:
1565 	 * We don't disable the hash type, but stop delivery the hash
1566 	 * value/type through mbufs on RX path.
1567 	 *
1568 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1569 	 * hash is delivered with type of TCP_IPV4.  This means if
1570 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1571 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1572 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1573 	 * here.
1574 	 */
1575 	if ((my_types & RSS_TYPE_IPV4) &&
1576 	    (diff_types & ifrh.ifrh_types &
1577 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1578 		/* Conflict; disable IPV4 hash type/value delivery. */
1579 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1580 		mbuf_types &= ~RSS_TYPE_IPV4;
1581 	}
1582 	if ((my_types & RSS_TYPE_IPV6) &&
1583 	    (diff_types & ifrh.ifrh_types &
1584 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1585 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1586 	      RSS_TYPE_IPV6_EX))) {
1587 		/* Conflict; disable IPV6 hash type/value delivery. */
1588 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1589 		mbuf_types &= ~RSS_TYPE_IPV6;
1590 	}
1591 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1592 	    (diff_types & ifrh.ifrh_types &
1593 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1594 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1595 	      RSS_TYPE_IPV6))) {
1596 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1597 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1598 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1599 	}
1600 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1601 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1602 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1603 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1604 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1605 	}
1606 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1607 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1608 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1609 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1610 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1611 	}
1612 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1613 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1614 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1615 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1616 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1617 	}
1618 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1619 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1620 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1621 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1622 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1623 	}
1624 
1625 	/*
1626 	 * Indirect table does not matter.
1627 	 */
1628 
1629 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1630 	    hn_rss_type_tondis(my_types);
1631 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1632 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1633 
1634 	if (reconf) {
1635 		error = hn_rss_reconfig(sc);
1636 		if (error) {
1637 			/* XXX roll-back? */
1638 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1639 			/* XXX keep going. */
1640 		}
1641 	}
1642 done:
1643 	/* Hash deliverability for mbufs. */
1644 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1645 }
1646 
1647 static void
1648 hn_vf_rss_restore(struct hn_softc *sc)
1649 {
1650 
1651 	HN_LOCK_ASSERT(sc);
1652 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1653 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1654 
1655 	if (sc->hn_rx_ring_inuse == 1)
1656 		goto done;
1657 
1658 	/*
1659 	 * Restore hash types.  Key does _not_ matter.
1660 	 */
1661 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1662 		int error;
1663 
1664 		sc->hn_rss_hash = sc->hn_rss_hcap;
1665 		error = hn_rss_reconfig(sc);
1666 		if (error) {
1667 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1668 			    error);
1669 			/* XXX keep going. */
1670 		}
1671 	}
1672 done:
1673 	/* Hash deliverability for mbufs. */
1674 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1675 }
1676 
1677 static void
1678 hn_xpnt_vf_setready(struct hn_softc *sc)
1679 {
1680 	struct ifnet *ifp, *vf_ifp;
1681 	struct ifreq ifr;
1682 
1683 	HN_LOCK_ASSERT(sc);
1684 	ifp = sc->hn_ifp;
1685 	vf_ifp = sc->hn_vf_ifp;
1686 
1687 	/*
1688 	 * Mark the VF ready.
1689 	 */
1690 	sc->hn_vf_rdytick = 0;
1691 
1692 	/*
1693 	 * Save information for restoration.
1694 	 */
1695 	sc->hn_saved_caps = ifp->if_capabilities;
1696 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1697 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1698 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1699 
1700 	/*
1701 	 * Intersect supported/enabled capabilities.
1702 	 *
1703 	 * NOTE:
1704 	 * if_hwassist is not changed here.
1705 	 */
1706 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1707 	ifp->if_capenable &= ifp->if_capabilities;
1708 
1709 	/*
1710 	 * Fix TSO settings.
1711 	 */
1712 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1713 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1714 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1715 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1716 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1717 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1718 
1719 	/*
1720 	 * Change VF's enabled capabilities.
1721 	 */
1722 	memset(&ifr, 0, sizeof(ifr));
1723 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1724 	ifr.ifr_reqcap = ifp->if_capenable;
1725 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1726 
1727 	if (ifp->if_mtu != ETHERMTU) {
1728 		int error;
1729 
1730 		/*
1731 		 * Change VF's MTU.
1732 		 */
1733 		memset(&ifr, 0, sizeof(ifr));
1734 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1735 		ifr.ifr_mtu = ifp->if_mtu;
1736 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1737 		if (error) {
1738 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1739 			    vf_ifp->if_xname, ifp->if_mtu);
1740 			if (ifp->if_mtu > ETHERMTU) {
1741 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1742 
1743 				/*
1744 				 * XXX
1745 				 * No need to adjust the synthetic parts' MTU;
1746 				 * failure of the adjustment will cause us
1747 				 * infinite headache.
1748 				 */
1749 				ifp->if_mtu = ETHERMTU;
1750 				hn_mtu_change_fixup(sc);
1751 			}
1752 		}
1753 	}
1754 }
1755 
1756 static bool
1757 hn_xpnt_vf_isready(struct hn_softc *sc)
1758 {
1759 
1760 	HN_LOCK_ASSERT(sc);
1761 
1762 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1763 		return (false);
1764 
1765 	if (sc->hn_vf_rdytick == 0)
1766 		return (true);
1767 
1768 	if (sc->hn_vf_rdytick > ticks)
1769 		return (false);
1770 
1771 	/* Mark VF as ready. */
1772 	hn_xpnt_vf_setready(sc);
1773 	return (true);
1774 }
1775 
1776 static void
1777 hn_xpnt_vf_setenable(struct hn_softc *sc)
1778 {
1779 	int i;
1780 
1781 	HN_LOCK_ASSERT(sc);
1782 
1783 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1784 	rm_wlock(&sc->hn_vf_lock);
1785 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1786 	rm_wunlock(&sc->hn_vf_lock);
1787 
1788 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1789 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1790 }
1791 
1792 static void
1793 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1794 {
1795 	int i;
1796 
1797 	HN_LOCK_ASSERT(sc);
1798 
1799 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1800 	rm_wlock(&sc->hn_vf_lock);
1801 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1802 	if (clear_vf)
1803 		sc->hn_vf_ifp = NULL;
1804 	rm_wunlock(&sc->hn_vf_lock);
1805 
1806 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1807 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1808 }
1809 
1810 static void
1811 hn_xpnt_vf_init(struct hn_softc *sc)
1812 {
1813 	int error;
1814 
1815 	HN_LOCK_ASSERT(sc);
1816 
1817 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1818 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1819 
1820 	if (bootverbose) {
1821 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1822 		    sc->hn_vf_ifp->if_xname);
1823 	}
1824 
1825 	/*
1826 	 * Bring the VF up.
1827 	 */
1828 	hn_xpnt_vf_saveifflags(sc);
1829 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1830 	error = hn_xpnt_vf_iocsetflags(sc);
1831 	if (error) {
1832 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1833 		    sc->hn_vf_ifp->if_xname, error);
1834 		return;
1835 	}
1836 
1837 	/*
1838 	 * NOTE:
1839 	 * Datapath setting must happen _after_ bringing the VF up.
1840 	 */
1841 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1842 
1843 	/*
1844 	 * NOTE:
1845 	 * Fixup RSS related bits _after_ the VF is brought up, since
1846 	 * many VFs generate RSS key during it's initialization.
1847 	 */
1848 	hn_vf_rss_fixup(sc, true);
1849 
1850 	/* Mark transparent mode VF as enabled. */
1851 	hn_xpnt_vf_setenable(sc);
1852 }
1853 
1854 static void
1855 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1856 {
1857 	struct hn_softc *sc = xsc;
1858 
1859 	HN_LOCK(sc);
1860 
1861 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1862 		goto done;
1863 	if (sc->hn_vf_ifp == NULL)
1864 		goto done;
1865 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1866 		goto done;
1867 
1868 	if (sc->hn_vf_rdytick != 0) {
1869 		/* Mark VF as ready. */
1870 		hn_xpnt_vf_setready(sc);
1871 	}
1872 
1873 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1874 		/*
1875 		 * Delayed VF initialization.
1876 		 */
1877 		if (bootverbose) {
1878 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1879 			    sc->hn_vf_ifp->if_xname);
1880 		}
1881 		hn_xpnt_vf_init(sc);
1882 	}
1883 done:
1884 	HN_UNLOCK(sc);
1885 }
1886 
1887 static void
1888 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1889 {
1890 	struct hn_softc *sc = xsc;
1891 
1892 	HN_LOCK(sc);
1893 
1894 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1895 		goto done;
1896 
1897 	if (!hn_ismyvf(sc, ifp))
1898 		goto done;
1899 
1900 	if (sc->hn_vf_ifp != NULL) {
1901 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1902 		    sc->hn_vf_ifp->if_xname);
1903 		goto done;
1904 	}
1905 
1906 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1907 		/*
1908 		 * ifnet.if_start is _not_ supported by transparent
1909 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1910 		 */
1911 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1912 		    "in transparent VF mode.\n", ifp->if_xname);
1913 		goto done;
1914 	}
1915 
1916 	rm_wlock(&hn_vfmap_lock);
1917 
1918 	if (ifp->if_index >= hn_vfmap_size) {
1919 		struct ifnet **newmap;
1920 		int newsize;
1921 
1922 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1923 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1924 		    M_WAITOK | M_ZERO);
1925 
1926 		memcpy(newmap, hn_vfmap,
1927 		    sizeof(struct ifnet *) * hn_vfmap_size);
1928 		free(hn_vfmap, M_DEVBUF);
1929 		hn_vfmap = newmap;
1930 		hn_vfmap_size = newsize;
1931 	}
1932 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1933 	    ("%s: ifindex %d was mapped to %s",
1934 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1935 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1936 
1937 	rm_wunlock(&hn_vfmap_lock);
1938 
1939 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1940 	rm_wlock(&sc->hn_vf_lock);
1941 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1942 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1943 	sc->hn_vf_ifp = ifp;
1944 	rm_wunlock(&sc->hn_vf_lock);
1945 
1946 	if (hn_xpnt_vf) {
1947 		int wait_ticks;
1948 
1949 		/*
1950 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1951 		 * Save vf_ifp's current if_input for later restoration.
1952 		 */
1953 		sc->hn_vf_input = ifp->if_input;
1954 		ifp->if_input = hn_xpnt_vf_input;
1955 
1956 		/*
1957 		 * Stop link status management; use the VF's.
1958 		 */
1959 		hn_suspend_mgmt(sc);
1960 
1961 		/*
1962 		 * Give VF sometime to complete its attach routing.
1963 		 */
1964 		wait_ticks = hn_xpnt_vf_attwait * hz;
1965 		sc->hn_vf_rdytick = ticks + wait_ticks;
1966 
1967 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1968 		    wait_ticks);
1969 	}
1970 done:
1971 	HN_UNLOCK(sc);
1972 }
1973 
1974 static void
1975 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1976 {
1977 	struct hn_softc *sc = xsc;
1978 
1979 	HN_LOCK(sc);
1980 
1981 	if (sc->hn_vf_ifp == NULL)
1982 		goto done;
1983 
1984 	if (!hn_ismyvf(sc, ifp))
1985 		goto done;
1986 
1987 	if (hn_xpnt_vf) {
1988 		/*
1989 		 * Make sure that the delayed initialization is not running.
1990 		 *
1991 		 * NOTE:
1992 		 * - This lock _must_ be released, since the hn_vf_init task
1993 		 *   will try holding this lock.
1994 		 * - It is safe to release this lock here, since the
1995 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1996 		 *
1997 		 * XXX racy, if hn(4) ever detached.
1998 		 */
1999 		HN_UNLOCK(sc);
2000 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2001 		HN_LOCK(sc);
2002 
2003 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2004 		    sc->hn_ifp->if_xname));
2005 		ifp->if_input = sc->hn_vf_input;
2006 		sc->hn_vf_input = NULL;
2007 
2008 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2009 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2010 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2011 
2012 		if (sc->hn_vf_rdytick == 0) {
2013 			/*
2014 			 * The VF was ready; restore some settings.
2015 			 */
2016 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2017 			/*
2018 			 * NOTE:
2019 			 * There is _no_ need to fixup if_capenable and
2020 			 * if_hwassist, since the if_capabilities before
2021 			 * restoration was an intersection of the VF's
2022 			 * if_capabilites and the synthetic device's
2023 			 * if_capabilites.
2024 			 */
2025 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2026 			sc->hn_ifp->if_hw_tsomaxsegcount =
2027 			    sc->hn_saved_tsosegcnt;
2028 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2029 		}
2030 
2031 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2032 			/*
2033 			 * Restore RSS settings.
2034 			 */
2035 			hn_vf_rss_restore(sc);
2036 
2037 			/*
2038 			 * Resume link status management, which was suspended
2039 			 * by hn_ifnet_attevent().
2040 			 */
2041 			hn_resume_mgmt(sc);
2042 		}
2043 	}
2044 
2045 	/* Mark transparent mode VF as disabled. */
2046 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2047 
2048 	rm_wlock(&hn_vfmap_lock);
2049 
2050 	KASSERT(ifp->if_index < hn_vfmap_size,
2051 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2052 	if (hn_vfmap[ifp->if_index] != NULL) {
2053 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2054 		    ("%s: ifindex %d was mapped to %s",
2055 		     ifp->if_xname, ifp->if_index,
2056 		     hn_vfmap[ifp->if_index]->if_xname));
2057 		hn_vfmap[ifp->if_index] = NULL;
2058 	}
2059 
2060 	rm_wunlock(&hn_vfmap_lock);
2061 done:
2062 	HN_UNLOCK(sc);
2063 }
2064 
2065 static void
2066 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2067 {
2068 	struct hn_softc *sc = xsc;
2069 
2070 	if (sc->hn_vf_ifp == ifp)
2071 		if_link_state_change(sc->hn_ifp, link_state);
2072 }
2073 
2074 static int
2075 hn_probe(device_t dev)
2076 {
2077 
2078 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2079 		device_set_desc(dev, "Hyper-V Network Interface");
2080 		return BUS_PROBE_DEFAULT;
2081 	}
2082 	return ENXIO;
2083 }
2084 
2085 static int
2086 hn_attach(device_t dev)
2087 {
2088 	struct hn_softc *sc = device_get_softc(dev);
2089 	struct sysctl_oid_list *child;
2090 	struct sysctl_ctx_list *ctx;
2091 	uint8_t eaddr[ETHER_ADDR_LEN];
2092 	struct ifnet *ifp = NULL;
2093 	int error, ring_cnt, tx_ring_cnt;
2094 	uint32_t mtu;
2095 
2096 	sc->hn_dev = dev;
2097 	sc->hn_prichan = vmbus_get_channel(dev);
2098 	HN_LOCK_INIT(sc);
2099 	rm_init(&sc->hn_vf_lock, "hnvf");
2100 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2101 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2102 
2103 	/*
2104 	 * Initialize these tunables once.
2105 	 */
2106 	sc->hn_agg_size = hn_tx_agg_size;
2107 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2108 
2109 	/*
2110 	 * Setup taskqueue for transmission.
2111 	 */
2112 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2113 		int i;
2114 
2115 		sc->hn_tx_taskqs =
2116 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2117 		    M_DEVBUF, M_WAITOK);
2118 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2119 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2120 			    M_WAITOK, taskqueue_thread_enqueue,
2121 			    &sc->hn_tx_taskqs[i]);
2122 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2123 			    "%s tx%d", device_get_nameunit(dev), i);
2124 		}
2125 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2126 		sc->hn_tx_taskqs = hn_tx_taskque;
2127 	}
2128 
2129 	/*
2130 	 * Setup taskqueue for mangement tasks, e.g. link status.
2131 	 */
2132 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2133 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2134 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2135 	    device_get_nameunit(dev));
2136 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2137 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2138 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2139 	    hn_netchg_status_taskfunc, sc);
2140 
2141 	if (hn_xpnt_vf) {
2142 		/*
2143 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2144 		 */
2145 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2146 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2147 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2148 		    device_get_nameunit(dev));
2149 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2150 		    hn_xpnt_vf_init_taskfunc, sc);
2151 	}
2152 
2153 	/*
2154 	 * Allocate ifnet and setup its name earlier, so that if_printf
2155 	 * can be used by functions, which will be called after
2156 	 * ether_ifattach().
2157 	 */
2158 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2159 	ifp->if_softc = sc;
2160 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2161 
2162 	/*
2163 	 * Initialize ifmedia earlier so that it can be unconditionally
2164 	 * destroyed, if error happened later on.
2165 	 */
2166 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2167 
2168 	/*
2169 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2170 	 * to use (tx_ring_cnt).
2171 	 *
2172 	 * NOTE:
2173 	 * The # of RX rings to use is same as the # of channels to use.
2174 	 */
2175 	ring_cnt = hn_chan_cnt;
2176 	if (ring_cnt <= 0) {
2177 		/* Default */
2178 		ring_cnt = mp_ncpus;
2179 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2180 			ring_cnt = HN_RING_CNT_DEF_MAX;
2181 	} else if (ring_cnt > mp_ncpus) {
2182 		ring_cnt = mp_ncpus;
2183 	}
2184 #ifdef RSS
2185 	if (ring_cnt > rss_getnumbuckets())
2186 		ring_cnt = rss_getnumbuckets();
2187 #endif
2188 
2189 	tx_ring_cnt = hn_tx_ring_cnt;
2190 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2191 		tx_ring_cnt = ring_cnt;
2192 #ifdef HN_IFSTART_SUPPORT
2193 	if (hn_use_if_start) {
2194 		/* ifnet.if_start only needs one TX ring. */
2195 		tx_ring_cnt = 1;
2196 	}
2197 #endif
2198 
2199 	/*
2200 	 * Set the leader CPU for channels.
2201 	 */
2202 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2203 
2204 	/*
2205 	 * Create enough TX/RX rings, even if only limited number of
2206 	 * channels can be allocated.
2207 	 */
2208 	error = hn_create_tx_data(sc, tx_ring_cnt);
2209 	if (error)
2210 		goto failed;
2211 	error = hn_create_rx_data(sc, ring_cnt);
2212 	if (error)
2213 		goto failed;
2214 
2215 	/*
2216 	 * Create transaction context for NVS and RNDIS transactions.
2217 	 */
2218 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2219 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2220 	if (sc->hn_xact == NULL) {
2221 		error = ENXIO;
2222 		goto failed;
2223 	}
2224 
2225 	/*
2226 	 * Install orphan handler for the revocation of this device's
2227 	 * primary channel.
2228 	 *
2229 	 * NOTE:
2230 	 * The processing order is critical here:
2231 	 * Install the orphan handler, _before_ testing whether this
2232 	 * device's primary channel has been revoked or not.
2233 	 */
2234 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2235 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2236 		error = ENXIO;
2237 		goto failed;
2238 	}
2239 
2240 	/*
2241 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2242 	 */
2243 	error = hn_synth_attach(sc, ETHERMTU);
2244 	if (error)
2245 		goto failed;
2246 
2247 	error = hn_rndis_get_eaddr(sc, eaddr);
2248 	if (error)
2249 		goto failed;
2250 
2251 	error = hn_rndis_get_mtu(sc, &mtu);
2252 	if (error)
2253 		mtu = ETHERMTU;
2254 	else if (bootverbose)
2255 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2256 
2257 	if (sc->hn_rx_ring_inuse > 1) {
2258 		/*
2259 		 * Reduce TCP segment aggregation limit for multiple
2260 		 * RX rings to increase ACK timeliness.
2261 		 */
2262 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2263 	}
2264 
2265 	/*
2266 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2267 	 */
2268 	hn_fixup_tx_data(sc);
2269 	hn_fixup_rx_data(sc);
2270 
2271 	ctx = device_get_sysctl_ctx(dev);
2272 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2273 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2274 	    &sc->hn_nvs_ver, 0, "NVS version");
2275 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2276 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2277 	    hn_ndis_version_sysctl, "A", "NDIS version");
2278 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2279 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2280 	    hn_caps_sysctl, "A", "capabilities");
2281 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2282 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2283 	    hn_hwassist_sysctl, "A", "hwassist");
2284 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2285 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2286 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2287 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2288 	    "max # of TSO segments");
2289 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2290 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2291 	    "max size of TSO segment");
2292 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2293 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2294 	    hn_rxfilter_sysctl, "A", "rxfilter");
2295 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2296 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2297 	    hn_rss_hash_sysctl, "A", "RSS hash");
2298 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2299 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2300 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2301 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2302 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2303 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2304 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2305 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2306 #ifndef RSS
2307 	/*
2308 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2309 	 */
2310 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2311 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2312 	    hn_rss_key_sysctl, "IU", "RSS key");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2314 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2316 #endif
2317 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2318 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2319 	    "RNDIS offered packet transmission aggregation size limit");
2320 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2321 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2322 	    "RNDIS offered packet transmission aggregation count limit");
2323 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2324 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2325 	    "RNDIS packet transmission aggregation alignment");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2327 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_txagg_size_sysctl, "I",
2329 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2331 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_txagg_pkts_sysctl, "I",
2333 	    "Packet transmission aggregation packets, "
2334 	    "0 -- disable, -1 -- auto");
2335 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2336 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2337 	    hn_polling_sysctl, "I",
2338 	    "Polling frequency: [100,1000000], 0 disable polling");
2339 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2340 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2341 	    hn_vf_sysctl, "A", "Virtual Function's name");
2342 	if (!hn_xpnt_vf) {
2343 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2344 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2346 	} else {
2347 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2348 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2349 		    hn_xpnt_vf_enabled_sysctl, "I",
2350 		    "Transparent VF enabled");
2351 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2352 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 		    hn_xpnt_vf_accbpf_sysctl, "I",
2354 		    "Accurate BPF for transparent VF");
2355 	}
2356 
2357 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2358 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2359 	    "switch to rsc");
2360 
2361 	/*
2362 	 * Setup the ifmedia, which has been initialized earlier.
2363 	 */
2364 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2365 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2366 	/* XXX ifmedia_set really should do this for us */
2367 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2368 
2369 	/*
2370 	 * Setup the ifnet for this interface.
2371 	 */
2372 
2373 	ifp->if_baudrate = IF_Gbps(10);
2374 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2375 	ifp->if_ioctl = hn_ioctl;
2376 	ifp->if_init = hn_init;
2377 #ifdef HN_IFSTART_SUPPORT
2378 	if (hn_use_if_start) {
2379 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2380 
2381 		ifp->if_start = hn_start;
2382 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2383 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2384 		IFQ_SET_READY(&ifp->if_snd);
2385 	} else
2386 #endif
2387 	{
2388 		ifp->if_transmit = hn_transmit;
2389 		ifp->if_qflush = hn_xmit_qflush;
2390 	}
2391 
2392 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2393 #ifdef foo
2394 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2395 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2396 #endif
2397 	if (sc->hn_caps & HN_CAP_VLAN) {
2398 		/* XXX not sure about VLAN_MTU. */
2399 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2400 	}
2401 
2402 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2403 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2404 		ifp->if_capabilities |= IFCAP_TXCSUM;
2405 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2406 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2407 	if (sc->hn_caps & HN_CAP_TSO4) {
2408 		ifp->if_capabilities |= IFCAP_TSO4;
2409 		ifp->if_hwassist |= CSUM_IP_TSO;
2410 	}
2411 	if (sc->hn_caps & HN_CAP_TSO6) {
2412 		ifp->if_capabilities |= IFCAP_TSO6;
2413 		ifp->if_hwassist |= CSUM_IP6_TSO;
2414 	}
2415 
2416 	/* Enable all available capabilities by default. */
2417 	ifp->if_capenable = ifp->if_capabilities;
2418 
2419 	/*
2420 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2421 	 * be enabled through SIOCSIFCAP.
2422 	 */
2423 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2424 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2425 
2426 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2427 		/*
2428 		 * Lock hn_set_tso_maxsize() to simplify its
2429 		 * internal logic.
2430 		 */
2431 		HN_LOCK(sc);
2432 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2433 		HN_UNLOCK(sc);
2434 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2435 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2436 	}
2437 
2438 	ether_ifattach(ifp, eaddr);
2439 
2440 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2441 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2442 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2443 	}
2444 	if (mtu < ETHERMTU) {
2445 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2446 		ifp->if_mtu = mtu;
2447 	}
2448 
2449 	/* Inform the upper layer about the long frame support. */
2450 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2451 
2452 	/*
2453 	 * Kick off link status check.
2454 	 */
2455 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2456 	hn_update_link_status(sc);
2457 
2458 	if (!hn_xpnt_vf) {
2459 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2460 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2461 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2462 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2463 	} else {
2464 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2465 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2466 	}
2467 
2468 	/*
2469 	 * NOTE:
2470 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2471 	 * since interface's LLADDR is needed; interface LLADDR is not
2472 	 * available when ifnet_arrival event is triggered.
2473 	 */
2474 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2475 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2476 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2477 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2478 
2479 	return (0);
2480 failed:
2481 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2482 		hn_synth_detach(sc);
2483 	hn_detach(dev);
2484 	return (error);
2485 }
2486 
2487 static int
2488 hn_detach(device_t dev)
2489 {
2490 	struct hn_softc *sc = device_get_softc(dev);
2491 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2492 
2493 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2494 		/*
2495 		 * In case that the vmbus missed the orphan handler
2496 		 * installation.
2497 		 */
2498 		vmbus_xact_ctx_orphan(sc->hn_xact);
2499 	}
2500 
2501 	if (sc->hn_ifaddr_evthand != NULL)
2502 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2503 	if (sc->hn_ifnet_evthand != NULL)
2504 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2505 	if (sc->hn_ifnet_atthand != NULL) {
2506 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2507 		    sc->hn_ifnet_atthand);
2508 	}
2509 	if (sc->hn_ifnet_dethand != NULL) {
2510 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2511 		    sc->hn_ifnet_dethand);
2512 	}
2513 	if (sc->hn_ifnet_lnkhand != NULL)
2514 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2515 
2516 	vf_ifp = sc->hn_vf_ifp;
2517 	__compiler_membar();
2518 	if (vf_ifp != NULL)
2519 		hn_ifnet_detevent(sc, vf_ifp);
2520 
2521 	if (device_is_attached(dev)) {
2522 		HN_LOCK(sc);
2523 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2524 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2525 				hn_stop(sc, true);
2526 			/*
2527 			 * NOTE:
2528 			 * hn_stop() only suspends data, so managment
2529 			 * stuffs have to be suspended manually here.
2530 			 */
2531 			hn_suspend_mgmt(sc);
2532 			hn_synth_detach(sc);
2533 		}
2534 		HN_UNLOCK(sc);
2535 		ether_ifdetach(ifp);
2536 	}
2537 
2538 	ifmedia_removeall(&sc->hn_media);
2539 	hn_destroy_rx_data(sc);
2540 	hn_destroy_tx_data(sc);
2541 
2542 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2543 		int i;
2544 
2545 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2546 			taskqueue_free(sc->hn_tx_taskqs[i]);
2547 		free(sc->hn_tx_taskqs, M_DEVBUF);
2548 	}
2549 	taskqueue_free(sc->hn_mgmt_taskq0);
2550 	if (sc->hn_vf_taskq != NULL)
2551 		taskqueue_free(sc->hn_vf_taskq);
2552 
2553 	if (sc->hn_xact != NULL) {
2554 		/*
2555 		 * Uninstall the orphan handler _before_ the xact is
2556 		 * destructed.
2557 		 */
2558 		vmbus_chan_unset_orphan(sc->hn_prichan);
2559 		vmbus_xact_ctx_destroy(sc->hn_xact);
2560 	}
2561 
2562 	if_free(ifp);
2563 
2564 	HN_LOCK_DESTROY(sc);
2565 	rm_destroy(&sc->hn_vf_lock);
2566 	return (0);
2567 }
2568 
2569 static int
2570 hn_shutdown(device_t dev)
2571 {
2572 
2573 	return (0);
2574 }
2575 
2576 static void
2577 hn_link_status(struct hn_softc *sc)
2578 {
2579 	uint32_t link_status;
2580 	int error;
2581 
2582 	error = hn_rndis_get_linkstatus(sc, &link_status);
2583 	if (error) {
2584 		/* XXX what to do? */
2585 		return;
2586 	}
2587 
2588 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2589 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2590 	else
2591 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2592 	if_link_state_change(sc->hn_ifp,
2593 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2594 	    LINK_STATE_UP : LINK_STATE_DOWN);
2595 }
2596 
2597 static void
2598 hn_link_taskfunc(void *xsc, int pending __unused)
2599 {
2600 	struct hn_softc *sc = xsc;
2601 
2602 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2603 		return;
2604 	hn_link_status(sc);
2605 }
2606 
2607 static void
2608 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2609 {
2610 	struct hn_softc *sc = xsc;
2611 
2612 	/* Prevent any link status checks from running. */
2613 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2614 
2615 	/*
2616 	 * Fake up a [link down --> link up] state change; 5 seconds
2617 	 * delay is used, which closely simulates miibus reaction
2618 	 * upon link down event.
2619 	 */
2620 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2621 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2622 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2623 	    &sc->hn_netchg_status, 5 * hz);
2624 }
2625 
2626 static void
2627 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2628 {
2629 	struct hn_softc *sc = xsc;
2630 
2631 	/* Re-allow link status checks. */
2632 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2633 	hn_link_status(sc);
2634 }
2635 
2636 static void
2637 hn_update_link_status(struct hn_softc *sc)
2638 {
2639 
2640 	if (sc->hn_mgmt_taskq != NULL)
2641 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2642 }
2643 
2644 static void
2645 hn_change_network(struct hn_softc *sc)
2646 {
2647 
2648 	if (sc->hn_mgmt_taskq != NULL)
2649 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2650 }
2651 
2652 static __inline int
2653 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2654     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2655 {
2656 	struct mbuf *m = *m_head;
2657 	int error;
2658 
2659 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2660 
2661 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2662 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2663 	if (error == EFBIG) {
2664 		struct mbuf *m_new;
2665 
2666 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2667 		if (m_new == NULL)
2668 			return ENOBUFS;
2669 		else
2670 			*m_head = m = m_new;
2671 		txr->hn_tx_collapsed++;
2672 
2673 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2674 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2675 	}
2676 	if (!error) {
2677 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2678 		    BUS_DMASYNC_PREWRITE);
2679 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2680 	}
2681 	return error;
2682 }
2683 
2684 static __inline int
2685 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2686 {
2687 
2688 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2689 	    ("put an onlist txd %#x", txd->flags));
2690 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2691 	    ("put an onagg txd %#x", txd->flags));
2692 
2693 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2694 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2695 		return 0;
2696 
2697 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2698 		struct hn_txdesc *tmp_txd;
2699 
2700 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2701 			int freed __diagused;
2702 
2703 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2704 			    ("resursive aggregation on aggregated txdesc"));
2705 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2706 			    ("not aggregated txdesc"));
2707 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 			    ("aggregated txdesc uses dmamap"));
2709 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2710 			    ("aggregated txdesc consumes "
2711 			     "chimney sending buffer"));
2712 			KASSERT(tmp_txd->chim_size == 0,
2713 			    ("aggregated txdesc has non-zero "
2714 			     "chimney sending size"));
2715 
2716 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2717 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2718 			freed = hn_txdesc_put(txr, tmp_txd);
2719 			KASSERT(freed, ("failed to free aggregated txdesc"));
2720 		}
2721 	}
2722 
2723 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2724 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2725 		    ("chim txd uses dmamap"));
2726 		hn_chim_free(txr->hn_sc, txd->chim_index);
2727 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2728 		txd->chim_size = 0;
2729 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2730 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2731 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2732 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2733 		    txd->data_dmap);
2734 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2735 	}
2736 
2737 	if (txd->m != NULL) {
2738 		m_freem(txd->m);
2739 		txd->m = NULL;
2740 	}
2741 
2742 	txd->flags |= HN_TXD_FLAG_ONLIST;
2743 #ifndef HN_USE_TXDESC_BUFRING
2744 	mtx_lock_spin(&txr->hn_txlist_spin);
2745 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2746 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2747 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2748 	txr->hn_txdesc_avail++;
2749 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2750 	mtx_unlock_spin(&txr->hn_txlist_spin);
2751 #else	/* HN_USE_TXDESC_BUFRING */
2752 #ifdef HN_DEBUG
2753 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2754 #endif
2755 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2756 #endif	/* !HN_USE_TXDESC_BUFRING */
2757 
2758 	return 1;
2759 }
2760 
2761 static __inline struct hn_txdesc *
2762 hn_txdesc_get(struct hn_tx_ring *txr)
2763 {
2764 	struct hn_txdesc *txd;
2765 
2766 #ifndef HN_USE_TXDESC_BUFRING
2767 	mtx_lock_spin(&txr->hn_txlist_spin);
2768 	txd = SLIST_FIRST(&txr->hn_txlist);
2769 	if (txd != NULL) {
2770 		KASSERT(txr->hn_txdesc_avail > 0,
2771 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2772 		txr->hn_txdesc_avail--;
2773 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2774 	}
2775 	mtx_unlock_spin(&txr->hn_txlist_spin);
2776 #else
2777 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2778 #endif
2779 
2780 	if (txd != NULL) {
2781 #ifdef HN_USE_TXDESC_BUFRING
2782 #ifdef HN_DEBUG
2783 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2784 #endif
2785 #endif	/* HN_USE_TXDESC_BUFRING */
2786 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2787 		    STAILQ_EMPTY(&txd->agg_list) &&
2788 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2789 		    txd->chim_size == 0 &&
2790 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2791 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2792 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2793 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2794 		txd->refs = 1;
2795 	}
2796 	return txd;
2797 }
2798 
2799 static __inline void
2800 hn_txdesc_hold(struct hn_txdesc *txd)
2801 {
2802 
2803 	/* 0->1 transition will never work */
2804 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2805 	atomic_add_int(&txd->refs, 1);
2806 }
2807 
2808 static __inline void
2809 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2810 {
2811 
2812 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2813 	    ("recursive aggregation on aggregating txdesc"));
2814 
2815 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2816 	    ("already aggregated"));
2817 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2818 	    ("recursive aggregation on to-be-aggregated txdesc"));
2819 
2820 	txd->flags |= HN_TXD_FLAG_ONAGG;
2821 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2822 }
2823 
2824 static bool
2825 hn_tx_ring_pending(struct hn_tx_ring *txr)
2826 {
2827 	bool pending = false;
2828 
2829 #ifndef HN_USE_TXDESC_BUFRING
2830 	mtx_lock_spin(&txr->hn_txlist_spin);
2831 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2832 		pending = true;
2833 	mtx_unlock_spin(&txr->hn_txlist_spin);
2834 #else
2835 	if (!buf_ring_full(txr->hn_txdesc_br))
2836 		pending = true;
2837 #endif
2838 	return (pending);
2839 }
2840 
2841 static __inline void
2842 hn_txeof(struct hn_tx_ring *txr)
2843 {
2844 	txr->hn_has_txeof = 0;
2845 	txr->hn_txeof(txr);
2846 }
2847 
2848 static void
2849 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2850     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2851 {
2852 	struct hn_txdesc *txd = sndc->hn_cbarg;
2853 	struct hn_tx_ring *txr;
2854 
2855 	txr = txd->txr;
2856 	KASSERT(txr->hn_chan == chan,
2857 	    ("channel mismatch, on chan%u, should be chan%u",
2858 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2859 
2860 	txr->hn_has_txeof = 1;
2861 	hn_txdesc_put(txr, txd);
2862 
2863 	++txr->hn_txdone_cnt;
2864 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2865 		txr->hn_txdone_cnt = 0;
2866 		if (txr->hn_oactive)
2867 			hn_txeof(txr);
2868 	}
2869 }
2870 
2871 static void
2872 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2873 {
2874 #if defined(INET) || defined(INET6)
2875 	struct epoch_tracker et;
2876 
2877 	NET_EPOCH_ENTER(et);
2878 	tcp_lro_flush_all(&rxr->hn_lro);
2879 	NET_EPOCH_EXIT(et);
2880 #endif
2881 
2882 	/*
2883 	 * NOTE:
2884 	 * 'txr' could be NULL, if multiple channels and
2885 	 * ifnet.if_start method are enabled.
2886 	 */
2887 	if (txr == NULL || !txr->hn_has_txeof)
2888 		return;
2889 
2890 	txr->hn_txdone_cnt = 0;
2891 	hn_txeof(txr);
2892 }
2893 
2894 static __inline uint32_t
2895 hn_rndis_pktmsg_offset(uint32_t ofs)
2896 {
2897 
2898 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2899 	    ("invalid RNDIS packet msg offset %u", ofs));
2900 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2901 }
2902 
2903 static __inline void *
2904 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2905     size_t pi_dlen, uint32_t pi_type)
2906 {
2907 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2908 	struct rndis_pktinfo *pi;
2909 
2910 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2911 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2912 
2913 	/*
2914 	 * Per-packet-info does not move; it only grows.
2915 	 *
2916 	 * NOTE:
2917 	 * rm_pktinfooffset in this phase counts from the beginning
2918 	 * of rndis_packet_msg.
2919 	 */
2920 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2921 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2922 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2923 	    pkt->rm_pktinfolen);
2924 	pkt->rm_pktinfolen += pi_size;
2925 
2926 	pi->rm_size = pi_size;
2927 	pi->rm_type = pi_type;
2928 	pi->rm_internal = 0;
2929 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2930 
2931 	return (pi->rm_data);
2932 }
2933 
2934 static __inline int
2935 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2936 {
2937 	struct hn_txdesc *txd;
2938 	struct mbuf *m;
2939 	int error, pkts;
2940 
2941 	txd = txr->hn_agg_txd;
2942 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2943 
2944 	/*
2945 	 * Since hn_txpkt() will reset this temporary stat, save
2946 	 * it now, so that oerrors can be updated properly, if
2947 	 * hn_txpkt() ever fails.
2948 	 */
2949 	pkts = txr->hn_stat_pkts;
2950 
2951 	/*
2952 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2953 	 * failure, save it for later freeing, if hn_txpkt() ever
2954 	 * fails.
2955 	 */
2956 	m = txd->m;
2957 	error = hn_txpkt(ifp, txr, txd);
2958 	if (__predict_false(error)) {
2959 		/* txd is freed, but m is not. */
2960 		m_freem(m);
2961 
2962 		txr->hn_flush_failed++;
2963 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2964 	}
2965 
2966 	/* Reset all aggregation states. */
2967 	txr->hn_agg_txd = NULL;
2968 	txr->hn_agg_szleft = 0;
2969 	txr->hn_agg_pktleft = 0;
2970 	txr->hn_agg_prevpkt = NULL;
2971 
2972 	return (error);
2973 }
2974 
2975 static void *
2976 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2977     int pktsize)
2978 {
2979 	void *chim;
2980 
2981 	if (txr->hn_agg_txd != NULL) {
2982 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2983 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2984 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2985 			int olen;
2986 
2987 			/*
2988 			 * Update the previous RNDIS packet's total length,
2989 			 * it can be increased due to the mandatory alignment
2990 			 * padding for this RNDIS packet.  And update the
2991 			 * aggregating txdesc's chimney sending buffer size
2992 			 * accordingly.
2993 			 *
2994 			 * XXX
2995 			 * Zero-out the padding, as required by the RNDIS spec.
2996 			 */
2997 			olen = pkt->rm_len;
2998 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2999 			agg_txd->chim_size += pkt->rm_len - olen;
3000 
3001 			/* Link this txdesc to the parent. */
3002 			hn_txdesc_agg(agg_txd, txd);
3003 
3004 			chim = (uint8_t *)pkt + pkt->rm_len;
3005 			/* Save the current packet for later fixup. */
3006 			txr->hn_agg_prevpkt = chim;
3007 
3008 			txr->hn_agg_pktleft--;
3009 			txr->hn_agg_szleft -= pktsize;
3010 			if (txr->hn_agg_szleft <=
3011 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3012 				/*
3013 				 * Probably can't aggregate more packets,
3014 				 * flush this aggregating txdesc proactively.
3015 				 */
3016 				txr->hn_agg_pktleft = 0;
3017 			}
3018 			/* Done! */
3019 			return (chim);
3020 		}
3021 		hn_flush_txagg(ifp, txr);
3022 	}
3023 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3024 
3025 	txr->hn_tx_chimney_tried++;
3026 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3027 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3028 		return (NULL);
3029 	txr->hn_tx_chimney++;
3030 
3031 	chim = txr->hn_sc->hn_chim +
3032 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3033 
3034 	if (txr->hn_agg_pktmax > 1 &&
3035 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3036 		txr->hn_agg_txd = txd;
3037 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3038 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3039 		txr->hn_agg_prevpkt = chim;
3040 	}
3041 	return (chim);
3042 }
3043 
3044 /*
3045  * NOTE:
3046  * If this function fails, then both txd and m_head0 will be freed.
3047  */
3048 static int
3049 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3050     struct mbuf **m_head0)
3051 {
3052 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3053 	int error, nsegs, i;
3054 	struct mbuf *m_head = *m_head0;
3055 	struct rndis_packet_msg *pkt;
3056 	uint32_t *pi_data;
3057 	void *chim = NULL;
3058 	int pkt_hlen, pkt_size;
3059 
3060 	pkt = txd->rndis_pkt;
3061 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3062 	if (pkt_size < txr->hn_chim_size) {
3063 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3064 		if (chim != NULL)
3065 			pkt = chim;
3066 	} else {
3067 		if (txr->hn_agg_txd != NULL)
3068 			hn_flush_txagg(ifp, txr);
3069 	}
3070 
3071 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3072 	pkt->rm_len = m_head->m_pkthdr.len;
3073 	pkt->rm_dataoffset = 0;
3074 	pkt->rm_datalen = m_head->m_pkthdr.len;
3075 	pkt->rm_oobdataoffset = 0;
3076 	pkt->rm_oobdatalen = 0;
3077 	pkt->rm_oobdataelements = 0;
3078 	pkt->rm_pktinfooffset = sizeof(*pkt);
3079 	pkt->rm_pktinfolen = 0;
3080 	pkt->rm_vchandle = 0;
3081 	pkt->rm_reserved = 0;
3082 
3083 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3084 		/*
3085 		 * Set the hash value for this packet.
3086 		 */
3087 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3088 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3089 
3090 		if (M_HASHTYPE_ISHASH(m_head))
3091 			/*
3092 			 * The flowid field contains the hash value host
3093 			 * set in the rx queue if it is a ip forwarding pkt.
3094 			 * Set the same hash value so host can send on the
3095 			 * cpu it was received.
3096 			 */
3097 			*pi_data = m_head->m_pkthdr.flowid;
3098 		else
3099 			/*
3100 			 * Otherwise just put the tx queue index.
3101 			 */
3102 			*pi_data = txr->hn_tx_idx;
3103 	}
3104 
3105 	if (m_head->m_flags & M_VLANTAG) {
3106 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3107 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3108 		*pi_data = NDIS_VLAN_INFO_MAKE(
3109 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3110 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3111 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3112 	}
3113 
3114 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3115 #if defined(INET6) || defined(INET)
3116 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3117 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3118 #ifdef INET
3119 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3120 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3121 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3122 			    m_head->m_pkthdr.tso_segsz);
3123 		}
3124 #endif
3125 #if defined(INET6) && defined(INET)
3126 		else
3127 #endif
3128 #ifdef INET6
3129 		{
3130 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3131 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3132 			    m_head->m_pkthdr.tso_segsz);
3133 		}
3134 #endif
3135 #endif	/* INET6 || INET */
3136 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3137 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3138 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3139 		if (m_head->m_pkthdr.csum_flags &
3140 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3141 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3142 		} else {
3143 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3144 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3145 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3146 		}
3147 
3148 		if (m_head->m_pkthdr.csum_flags &
3149 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3150 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3151 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3152 		} else if (m_head->m_pkthdr.csum_flags &
3153 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3154 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3155 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3156 		}
3157 	}
3158 
3159 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3160 	/* Fixup RNDIS packet message total length */
3161 	pkt->rm_len += pkt_hlen;
3162 	/* Convert RNDIS packet message offsets */
3163 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3164 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3165 
3166 	/*
3167 	 * Fast path: Chimney sending.
3168 	 */
3169 	if (chim != NULL) {
3170 		struct hn_txdesc *tgt_txd = txd;
3171 
3172 		if (txr->hn_agg_txd != NULL) {
3173 			tgt_txd = txr->hn_agg_txd;
3174 #ifdef INVARIANTS
3175 			*m_head0 = NULL;
3176 #endif
3177 		}
3178 
3179 		KASSERT(pkt == chim,
3180 		    ("RNDIS pkt not in chimney sending buffer"));
3181 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3182 		    ("chimney sending buffer is not used"));
3183 		tgt_txd->chim_size += pkt->rm_len;
3184 
3185 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3186 		    ((uint8_t *)chim) + pkt_hlen);
3187 
3188 		txr->hn_gpa_cnt = 0;
3189 		txr->hn_sendpkt = hn_txpkt_chim;
3190 		goto done;
3191 	}
3192 
3193 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3194 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3195 	    ("chimney buffer is used"));
3196 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3197 
3198 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3199 	if (__predict_false(error)) {
3200 		int freed __diagused;
3201 
3202 		/*
3203 		 * This mbuf is not linked w/ the txd yet, so free it now.
3204 		 */
3205 		m_freem(m_head);
3206 		*m_head0 = NULL;
3207 
3208 		freed = hn_txdesc_put(txr, txd);
3209 		KASSERT(freed != 0,
3210 		    ("fail to free txd upon txdma error"));
3211 
3212 		txr->hn_txdma_failed++;
3213 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3214 		return error;
3215 	}
3216 	*m_head0 = m_head;
3217 
3218 	/* +1 RNDIS packet message */
3219 	txr->hn_gpa_cnt = nsegs + 1;
3220 
3221 	/* send packet with page buffer */
3222 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3223 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3224 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3225 
3226 	/*
3227 	 * Fill the page buffers with mbuf info after the page
3228 	 * buffer for RNDIS packet message.
3229 	 */
3230 	for (i = 0; i < nsegs; ++i) {
3231 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3232 
3233 		gpa->gpa_page = atop(segs[i].ds_addr);
3234 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3235 		gpa->gpa_len = segs[i].ds_len;
3236 	}
3237 
3238 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3239 	txd->chim_size = 0;
3240 	txr->hn_sendpkt = hn_txpkt_sglist;
3241 done:
3242 	txd->m = m_head;
3243 
3244 	/* Set the completion routine */
3245 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3246 
3247 	/* Update temporary stats for later use. */
3248 	txr->hn_stat_pkts++;
3249 	txr->hn_stat_size += m_head->m_pkthdr.len;
3250 	if (m_head->m_flags & M_MCAST)
3251 		txr->hn_stat_mcasts++;
3252 
3253 	return 0;
3254 }
3255 
3256 /*
3257  * NOTE:
3258  * If this function fails, then txd will be freed, but the mbuf
3259  * associated w/ the txd will _not_ be freed.
3260  */
3261 static int
3262 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3263 {
3264 	int error, send_failed = 0, has_bpf;
3265 
3266 again:
3267 	has_bpf = bpf_peers_present(ifp->if_bpf);
3268 	if (has_bpf) {
3269 		/*
3270 		 * Make sure that this txd and any aggregated txds are not
3271 		 * freed before ETHER_BPF_MTAP.
3272 		 */
3273 		hn_txdesc_hold(txd);
3274 	}
3275 	error = txr->hn_sendpkt(txr, txd);
3276 	if (!error) {
3277 		if (has_bpf) {
3278 			const struct hn_txdesc *tmp_txd;
3279 
3280 			ETHER_BPF_MTAP(ifp, txd->m);
3281 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3282 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3283 		}
3284 
3285 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3286 #ifdef HN_IFSTART_SUPPORT
3287 		if (!hn_use_if_start)
3288 #endif
3289 		{
3290 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3291 			    txr->hn_stat_size);
3292 			if (txr->hn_stat_mcasts != 0) {
3293 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3294 				    txr->hn_stat_mcasts);
3295 			}
3296 		}
3297 		txr->hn_pkts += txr->hn_stat_pkts;
3298 		txr->hn_sends++;
3299 	}
3300 	if (has_bpf)
3301 		hn_txdesc_put(txr, txd);
3302 
3303 	if (__predict_false(error)) {
3304 		int freed __diagused;
3305 
3306 		/*
3307 		 * This should "really rarely" happen.
3308 		 *
3309 		 * XXX Too many RX to be acked or too many sideband
3310 		 * commands to run?  Ask netvsc_channel_rollup()
3311 		 * to kick start later.
3312 		 */
3313 		txr->hn_has_txeof = 1;
3314 		if (!send_failed) {
3315 			txr->hn_send_failed++;
3316 			send_failed = 1;
3317 			/*
3318 			 * Try sending again after set hn_has_txeof;
3319 			 * in case that we missed the last
3320 			 * netvsc_channel_rollup().
3321 			 */
3322 			goto again;
3323 		}
3324 		if_printf(ifp, "send failed\n");
3325 
3326 		/*
3327 		 * Caller will perform further processing on the
3328 		 * associated mbuf, so don't free it in hn_txdesc_put();
3329 		 * only unload it from the DMA map in hn_txdesc_put(),
3330 		 * if it was loaded.
3331 		 */
3332 		txd->m = NULL;
3333 		freed = hn_txdesc_put(txr, txd);
3334 		KASSERT(freed != 0,
3335 		    ("fail to free txd upon send error"));
3336 
3337 		txr->hn_send_failed++;
3338 	}
3339 
3340 	/* Reset temporary stats, after this sending is done. */
3341 	txr->hn_stat_size = 0;
3342 	txr->hn_stat_pkts = 0;
3343 	txr->hn_stat_mcasts = 0;
3344 
3345 	return (error);
3346 }
3347 
3348 /*
3349  * Append the specified data to the indicated mbuf chain,
3350  * Extend the mbuf chain if the new data does not fit in
3351  * existing space.
3352  *
3353  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3354  * There should be an equivalent in the kernel mbuf code,
3355  * but there does not appear to be one yet.
3356  *
3357  * Differs from m_append() in that additional mbufs are
3358  * allocated with cluster size MJUMPAGESIZE, and filled
3359  * accordingly.
3360  *
3361  * Return the last mbuf in the chain or NULL if failed to
3362  * allocate new mbuf.
3363  */
3364 static struct mbuf *
3365 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3366 {
3367 	struct mbuf *m, *n;
3368 	int remainder, space;
3369 
3370 	for (m = m0; m->m_next != NULL; m = m->m_next)
3371 		;
3372 	remainder = len;
3373 	space = M_TRAILINGSPACE(m);
3374 	if (space > 0) {
3375 		/*
3376 		 * Copy into available space.
3377 		 */
3378 		if (space > remainder)
3379 			space = remainder;
3380 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3381 		m->m_len += space;
3382 		cp += space;
3383 		remainder -= space;
3384 	}
3385 	while (remainder > 0) {
3386 		/*
3387 		 * Allocate a new mbuf; could check space
3388 		 * and allocate a cluster instead.
3389 		 */
3390 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3391 		if (n == NULL)
3392 			return NULL;
3393 		n->m_len = min(MJUMPAGESIZE, remainder);
3394 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3395 		cp += n->m_len;
3396 		remainder -= n->m_len;
3397 		m->m_next = n;
3398 		m = n;
3399 	}
3400 
3401 	return m;
3402 }
3403 
3404 #if defined(INET) || defined(INET6)
3405 static __inline int
3406 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3407 {
3408 	if (hn_lro_mbufq_depth) {
3409 		tcp_lro_queue_mbuf(lc, m);
3410 		return 0;
3411 	}
3412 	return tcp_lro_rx(lc, m, 0);
3413 }
3414 #endif
3415 
3416 static int
3417 hn_rxpkt(struct hn_rx_ring *rxr)
3418 {
3419 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3420 	struct mbuf *m_new, *n;
3421 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3422 	int hash_type = M_HASHTYPE_NONE;
3423 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3424 	int i;
3425 
3426 	ifp = hn_ifp;
3427 	if (rxr->hn_rxvf_ifp != NULL) {
3428 		/*
3429 		 * Non-transparent mode VF; pretend this packet is from
3430 		 * the VF.
3431 		 */
3432 		ifp = rxr->hn_rxvf_ifp;
3433 		is_vf = 1;
3434 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3435 		/* Transparent mode VF. */
3436 		is_vf = 1;
3437 	}
3438 
3439 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3440 		/*
3441 		 * NOTE:
3442 		 * See the NOTE of hn_rndis_init_fixat().  This
3443 		 * function can be reached, immediately after the
3444 		 * RNDIS is initialized but before the ifnet is
3445 		 * setup on the hn_attach() path; drop the unexpected
3446 		 * packets.
3447 		 */
3448 		return (0);
3449 	}
3450 
3451 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3452 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3453 		return (0);
3454 	}
3455 
3456 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3457 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3458 		if (m_new == NULL) {
3459 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3460 			return (0);
3461 		}
3462 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3463 		    rxr->rsc.frag_len[0]);
3464 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3465 	} else {
3466 		/*
3467 		 * Get an mbuf with a cluster.  For packets 2K or less,
3468 		 * get a standard 2K cluster.  For anything larger, get a
3469 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3470 		 * if looped around to the Hyper-V TX channel, so avoid them.
3471 		 */
3472 		size = MCLBYTES;
3473 		if (rxr->rsc.pktlen > MCLBYTES) {
3474 			/* 4096 */
3475 			size = MJUMPAGESIZE;
3476 		}
3477 
3478 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3479 		if (m_new == NULL) {
3480 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3481 			return (0);
3482 		}
3483 
3484 		n = m_new;
3485 		for (i = 0; i < rxr->rsc.cnt; i++) {
3486 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3487 			    rxr->rsc.frag_data[i]);
3488 			if (n == NULL) {
3489 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3490 				return (0);
3491 			} else {
3492 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3493 			}
3494 		}
3495 	}
3496 	if (rxr->rsc.pktlen <= MHLEN)
3497 		rxr->hn_small_pkts++;
3498 
3499 	m_new->m_pkthdr.rcvif = ifp;
3500 
3501 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3502 		do_csum = 0;
3503 
3504 	/* receive side checksum offload */
3505 	if (rxr->rsc.csum_info != NULL) {
3506 		/* IP csum offload */
3507 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3508 			m_new->m_pkthdr.csum_flags |=
3509 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3510 			rxr->hn_csum_ip++;
3511 		}
3512 
3513 		/* TCP/UDP csum offload */
3514 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3515 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3516 			m_new->m_pkthdr.csum_flags |=
3517 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3518 			m_new->m_pkthdr.csum_data = 0xffff;
3519 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3520 				rxr->hn_csum_tcp++;
3521 			else
3522 				rxr->hn_csum_udp++;
3523 		}
3524 
3525 		/*
3526 		 * XXX
3527 		 * As of this write (Oct 28th, 2016), host side will turn
3528 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3529 		 * the do_lro setting here is actually _not_ accurate.  We
3530 		 * depend on the RSS hash type check to reset do_lro.
3531 		 */
3532 		if ((*(rxr->rsc.csum_info) &
3533 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3534 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3535 			do_lro = 1;
3536 	} else {
3537 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3538 		if (l3proto == ETHERTYPE_IP) {
3539 			if (l4proto == IPPROTO_TCP) {
3540 				if (do_csum &&
3541 				    (rxr->hn_trust_hcsum &
3542 				     HN_TRUST_HCSUM_TCP)) {
3543 					rxr->hn_csum_trusted++;
3544 					m_new->m_pkthdr.csum_flags |=
3545 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3546 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3547 					m_new->m_pkthdr.csum_data = 0xffff;
3548 				}
3549 				do_lro = 1;
3550 			} else if (l4proto == IPPROTO_UDP) {
3551 				if (do_csum &&
3552 				    (rxr->hn_trust_hcsum &
3553 				     HN_TRUST_HCSUM_UDP)) {
3554 					rxr->hn_csum_trusted++;
3555 					m_new->m_pkthdr.csum_flags |=
3556 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3557 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3558 					m_new->m_pkthdr.csum_data = 0xffff;
3559 				}
3560 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3561 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3562 				rxr->hn_csum_trusted++;
3563 				m_new->m_pkthdr.csum_flags |=
3564 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3565 			}
3566 		}
3567 	}
3568 
3569 	if (rxr->rsc.vlan_info != NULL) {
3570 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3571 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3572 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3573 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3574 		m_new->m_flags |= M_VLANTAG;
3575 	}
3576 
3577 	/*
3578 	 * If VF is activated (tranparent/non-transparent mode does not
3579 	 * matter here).
3580 	 *
3581 	 * - Disable LRO
3582 	 *
3583 	 *   hn(4) will only receive broadcast packets, multicast packets,
3584 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3585 	 *   packet types.
3586 	 *
3587 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3588 	 *   all, since the LRO flush will use hn(4) as the receiving
3589 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3590 	 */
3591 	if (is_vf)
3592 		do_lro = 0;
3593 
3594 	/*
3595 	 * If VF is activated (tranparent/non-transparent mode does not
3596 	 * matter here), do _not_ mess with unsupported hash types or
3597 	 * functions.
3598 	 */
3599 	if (rxr->rsc.hash_info != NULL) {
3600 		rxr->hn_rss_pkts++;
3601 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3602 		if (!is_vf)
3603 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3604 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3605 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3606 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3607 			    rxr->hn_mbuf_hash);
3608 
3609 			/*
3610 			 * NOTE:
3611 			 * do_lro is resetted, if the hash types are not TCP
3612 			 * related.  See the comment in the above csum_flags
3613 			 * setup section.
3614 			 */
3615 			switch (type) {
3616 			case NDIS_HASH_IPV4:
3617 				hash_type = M_HASHTYPE_RSS_IPV4;
3618 				do_lro = 0;
3619 				break;
3620 
3621 			case NDIS_HASH_TCP_IPV4:
3622 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3623 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3624 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3625 
3626 					if (is_vf)
3627 						def_htype = M_HASHTYPE_NONE;
3628 
3629 					/*
3630 					 * UDP 4-tuple hash is delivered as
3631 					 * TCP 4-tuple hash.
3632 					 */
3633 					if (l3proto == ETHERTYPE_MAX) {
3634 						hn_rxpkt_proto(m_new,
3635 						    &l3proto, &l4proto);
3636 					}
3637 					if (l3proto == ETHERTYPE_IP) {
3638 						if (l4proto == IPPROTO_UDP &&
3639 						    (rxr->hn_mbuf_hash &
3640 						     NDIS_HASH_UDP_IPV4_X)) {
3641 							hash_type =
3642 							M_HASHTYPE_RSS_UDP_IPV4;
3643 							do_lro = 0;
3644 						} else if (l4proto !=
3645 						    IPPROTO_TCP) {
3646 							hash_type = def_htype;
3647 							do_lro = 0;
3648 						}
3649 					} else {
3650 						hash_type = def_htype;
3651 						do_lro = 0;
3652 					}
3653 				}
3654 				break;
3655 
3656 			case NDIS_HASH_IPV6:
3657 				hash_type = M_HASHTYPE_RSS_IPV6;
3658 				do_lro = 0;
3659 				break;
3660 
3661 			case NDIS_HASH_IPV6_EX:
3662 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3663 				do_lro = 0;
3664 				break;
3665 
3666 			case NDIS_HASH_TCP_IPV6:
3667 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3668 				break;
3669 
3670 			case NDIS_HASH_TCP_IPV6_EX:
3671 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3672 				break;
3673 			}
3674 		}
3675 	} else if (!is_vf) {
3676 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3677 		hash_type = M_HASHTYPE_OPAQUE;
3678 	}
3679 	M_HASHTYPE_SET(m_new, hash_type);
3680 
3681 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3682 	if (hn_ifp != ifp) {
3683 		const struct ether_header *eh;
3684 
3685 		/*
3686 		 * Non-transparent mode VF is activated.
3687 		 */
3688 
3689 		/*
3690 		 * Allow tapping on hn(4).
3691 		 */
3692 		ETHER_BPF_MTAP(hn_ifp, m_new);
3693 
3694 		/*
3695 		 * Update hn(4)'s stats.
3696 		 */
3697 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3698 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3699 		/* Checked at the beginning of this function. */
3700 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3701 		eh = mtod(m_new, struct ether_header *);
3702 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3703 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3704 	}
3705 	rxr->hn_pkts++;
3706 
3707 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3708 #if defined(INET) || defined(INET6)
3709 		struct lro_ctrl *lro = &rxr->hn_lro;
3710 
3711 		if (lro->lro_cnt) {
3712 			rxr->hn_lro_tried++;
3713 			if (hn_lro_rx(lro, m_new) == 0) {
3714 				/* DONE! */
3715 				return 0;
3716 			}
3717 		}
3718 #endif
3719 	}
3720 	ifp->if_input(ifp, m_new);
3721 
3722 	return (0);
3723 }
3724 
3725 static int
3726 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3727 {
3728 	struct hn_softc *sc = ifp->if_softc;
3729 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3730 	struct ifnet *vf_ifp;
3731 	int mask, error = 0;
3732 	struct ifrsskey *ifrk;
3733 	struct ifrsshash *ifrh;
3734 	uint32_t mtu;
3735 
3736 	switch (cmd) {
3737 	case SIOCSIFMTU:
3738 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3739 			error = EINVAL;
3740 			break;
3741 		}
3742 
3743 		HN_LOCK(sc);
3744 
3745 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3746 			HN_UNLOCK(sc);
3747 			break;
3748 		}
3749 
3750 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3751 			/* Can't change MTU */
3752 			HN_UNLOCK(sc);
3753 			error = EOPNOTSUPP;
3754 			break;
3755 		}
3756 
3757 		if (ifp->if_mtu == ifr->ifr_mtu) {
3758 			HN_UNLOCK(sc);
3759 			break;
3760 		}
3761 
3762 		if (hn_xpnt_vf_isready(sc)) {
3763 			vf_ifp = sc->hn_vf_ifp;
3764 			ifr_vf = *ifr;
3765 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3766 			    sizeof(ifr_vf.ifr_name));
3767 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3768 			    (caddr_t)&ifr_vf);
3769 			if (error) {
3770 				HN_UNLOCK(sc);
3771 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3772 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3773 				break;
3774 			}
3775 		}
3776 
3777 		/*
3778 		 * Suspend this interface before the synthetic parts
3779 		 * are ripped.
3780 		 */
3781 		hn_suspend(sc);
3782 
3783 		/*
3784 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3785 		 */
3786 		hn_synth_detach(sc);
3787 
3788 		/*
3789 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3790 		 * with the new MTU setting.
3791 		 */
3792 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3793 		if (error) {
3794 			HN_UNLOCK(sc);
3795 			break;
3796 		}
3797 
3798 		error = hn_rndis_get_mtu(sc, &mtu);
3799 		if (error)
3800 			mtu = ifr->ifr_mtu;
3801 		else if (bootverbose)
3802 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3803 
3804 		/*
3805 		 * Commit the requested MTU, after the synthetic parts
3806 		 * have been successfully attached.
3807 		 */
3808 		if (mtu >= ifr->ifr_mtu) {
3809 			mtu = ifr->ifr_mtu;
3810 		} else {
3811 			if_printf(ifp, "fixup mtu %d -> %u\n",
3812 			    ifr->ifr_mtu, mtu);
3813 		}
3814 		ifp->if_mtu = mtu;
3815 
3816 		/*
3817 		 * Synthetic parts' reattach may change the chimney
3818 		 * sending size; update it.
3819 		 */
3820 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3821 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3822 
3823 		/*
3824 		 * Make sure that various parameters based on MTU are
3825 		 * still valid, after the MTU change.
3826 		 */
3827 		hn_mtu_change_fixup(sc);
3828 
3829 		/*
3830 		 * All done!  Resume the interface now.
3831 		 */
3832 		hn_resume(sc);
3833 
3834 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3835 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3836 			/*
3837 			 * Since we have reattached the NVS part,
3838 			 * change the datapath to VF again; in case
3839 			 * that it is lost, after the NVS was detached.
3840 			 */
3841 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3842 		}
3843 
3844 		HN_UNLOCK(sc);
3845 		break;
3846 
3847 	case SIOCSIFFLAGS:
3848 		HN_LOCK(sc);
3849 
3850 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3851 			HN_UNLOCK(sc);
3852 			break;
3853 		}
3854 
3855 		if (hn_xpnt_vf_isready(sc))
3856 			hn_xpnt_vf_saveifflags(sc);
3857 
3858 		if (ifp->if_flags & IFF_UP) {
3859 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3860 				/*
3861 				 * Caller meight hold mutex, e.g.
3862 				 * bpf; use busy-wait for the RNDIS
3863 				 * reply.
3864 				 */
3865 				HN_NO_SLEEPING(sc);
3866 				hn_rxfilter_config(sc);
3867 				HN_SLEEPING_OK(sc);
3868 
3869 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3870 					error = hn_xpnt_vf_iocsetflags(sc);
3871 			} else {
3872 				hn_init_locked(sc);
3873 			}
3874 		} else {
3875 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3876 				hn_stop(sc, false);
3877 		}
3878 		sc->hn_if_flags = ifp->if_flags;
3879 
3880 		HN_UNLOCK(sc);
3881 		break;
3882 
3883 	case SIOCSIFCAP:
3884 		HN_LOCK(sc);
3885 
3886 		if (hn_xpnt_vf_isready(sc)) {
3887 			ifr_vf = *ifr;
3888 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3889 			    sizeof(ifr_vf.ifr_name));
3890 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3891 			HN_UNLOCK(sc);
3892 			break;
3893 		}
3894 
3895 		/*
3896 		 * Fix up requested capabilities w/ supported capabilities,
3897 		 * since the supported capabilities could have been changed.
3898 		 */
3899 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3900 		    ifp->if_capenable;
3901 
3902 		if (mask & IFCAP_TXCSUM) {
3903 			ifp->if_capenable ^= IFCAP_TXCSUM;
3904 			if (ifp->if_capenable & IFCAP_TXCSUM)
3905 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3906 			else
3907 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3908 		}
3909 		if (mask & IFCAP_TXCSUM_IPV6) {
3910 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3911 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3912 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3913 			else
3914 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3915 		}
3916 
3917 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3918 		if (mask & IFCAP_RXCSUM)
3919 			ifp->if_capenable ^= IFCAP_RXCSUM;
3920 #ifdef foo
3921 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3922 		if (mask & IFCAP_RXCSUM_IPV6)
3923 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3924 #endif
3925 
3926 		if (mask & IFCAP_LRO)
3927 			ifp->if_capenable ^= IFCAP_LRO;
3928 
3929 		if (mask & IFCAP_TSO4) {
3930 			ifp->if_capenable ^= IFCAP_TSO4;
3931 			if (ifp->if_capenable & IFCAP_TSO4)
3932 				ifp->if_hwassist |= CSUM_IP_TSO;
3933 			else
3934 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3935 		}
3936 		if (mask & IFCAP_TSO6) {
3937 			ifp->if_capenable ^= IFCAP_TSO6;
3938 			if (ifp->if_capenable & IFCAP_TSO6)
3939 				ifp->if_hwassist |= CSUM_IP6_TSO;
3940 			else
3941 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3942 		}
3943 
3944 		HN_UNLOCK(sc);
3945 		break;
3946 
3947 	case SIOCADDMULTI:
3948 	case SIOCDELMULTI:
3949 		HN_LOCK(sc);
3950 
3951 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3952 			HN_UNLOCK(sc);
3953 			break;
3954 		}
3955 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3956 			/*
3957 			 * Multicast uses mutex; use busy-wait for
3958 			 * the RNDIS reply.
3959 			 */
3960 			HN_NO_SLEEPING(sc);
3961 			hn_rxfilter_config(sc);
3962 			HN_SLEEPING_OK(sc);
3963 		}
3964 
3965 		/* XXX vlan(4) style mcast addr maintenance */
3966 		if (hn_xpnt_vf_isready(sc)) {
3967 			int old_if_flags;
3968 
3969 			old_if_flags = sc->hn_vf_ifp->if_flags;
3970 			hn_xpnt_vf_saveifflags(sc);
3971 
3972 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3973 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3974 			     IFF_ALLMULTI))
3975 				error = hn_xpnt_vf_iocsetflags(sc);
3976 		}
3977 
3978 		HN_UNLOCK(sc);
3979 		break;
3980 
3981 	case SIOCSIFMEDIA:
3982 	case SIOCGIFMEDIA:
3983 		HN_LOCK(sc);
3984 		if (hn_xpnt_vf_isready(sc)) {
3985 			/*
3986 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3987 			 * create and pass ifr_vf to the VF here; just
3988 			 * replace the ifr_name.
3989 			 */
3990 			vf_ifp = sc->hn_vf_ifp;
3991 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3992 			    sizeof(ifr->ifr_name));
3993 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3994 			/* Restore the ifr_name. */
3995 			strlcpy(ifr->ifr_name, ifp->if_xname,
3996 			    sizeof(ifr->ifr_name));
3997 			HN_UNLOCK(sc);
3998 			break;
3999 		}
4000 		HN_UNLOCK(sc);
4001 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4002 		break;
4003 
4004 	case SIOCGIFRSSHASH:
4005 		ifrh = (struct ifrsshash *)data;
4006 		HN_LOCK(sc);
4007 		if (sc->hn_rx_ring_inuse == 1) {
4008 			HN_UNLOCK(sc);
4009 			ifrh->ifrh_func = RSS_FUNC_NONE;
4010 			ifrh->ifrh_types = 0;
4011 			break;
4012 		}
4013 
4014 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4015 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4016 		else
4017 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4018 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4019 		HN_UNLOCK(sc);
4020 		break;
4021 
4022 	case SIOCGIFRSSKEY:
4023 		ifrk = (struct ifrsskey *)data;
4024 		HN_LOCK(sc);
4025 		if (sc->hn_rx_ring_inuse == 1) {
4026 			HN_UNLOCK(sc);
4027 			ifrk->ifrk_func = RSS_FUNC_NONE;
4028 			ifrk->ifrk_keylen = 0;
4029 			break;
4030 		}
4031 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4032 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4033 		else
4034 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4035 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4036 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4037 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4038 		HN_UNLOCK(sc);
4039 		break;
4040 
4041 	default:
4042 		error = ether_ioctl(ifp, cmd, data);
4043 		break;
4044 	}
4045 	return (error);
4046 }
4047 
4048 static void
4049 hn_stop(struct hn_softc *sc, bool detaching)
4050 {
4051 	struct ifnet *ifp = sc->hn_ifp;
4052 	int i;
4053 
4054 	HN_LOCK_ASSERT(sc);
4055 
4056 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4057 	    ("synthetic parts were not attached"));
4058 
4059 	/* Clear RUNNING bit ASAP. */
4060 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4061 
4062 	/* Disable polling. */
4063 	hn_polling(sc, 0);
4064 
4065 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4066 		KASSERT(sc->hn_vf_ifp != NULL,
4067 		    ("%s: VF is not attached", ifp->if_xname));
4068 
4069 		/* Mark transparent mode VF as disabled. */
4070 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4071 
4072 		/*
4073 		 * NOTE:
4074 		 * Datapath setting must happen _before_ bringing
4075 		 * the VF down.
4076 		 */
4077 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4078 
4079 		/*
4080 		 * Bring the VF down.
4081 		 */
4082 		hn_xpnt_vf_saveifflags(sc);
4083 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4084 		hn_xpnt_vf_iocsetflags(sc);
4085 	}
4086 
4087 	/* Suspend data transfers. */
4088 	hn_suspend_data(sc);
4089 
4090 	/* Clear OACTIVE bit. */
4091 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4092 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4093 		sc->hn_tx_ring[i].hn_oactive = 0;
4094 
4095 	/*
4096 	 * If the non-transparent mode VF is active, make sure
4097 	 * that the RX filter still allows packet reception.
4098 	 */
4099 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4100 		hn_rxfilter_config(sc);
4101 }
4102 
4103 static void
4104 hn_init_locked(struct hn_softc *sc)
4105 {
4106 	struct ifnet *ifp = sc->hn_ifp;
4107 	int i;
4108 
4109 	HN_LOCK_ASSERT(sc);
4110 
4111 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4112 		return;
4113 
4114 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4115 		return;
4116 
4117 	/* Configure RX filter */
4118 	hn_rxfilter_config(sc);
4119 
4120 	/* Clear OACTIVE bit. */
4121 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4122 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4123 		sc->hn_tx_ring[i].hn_oactive = 0;
4124 
4125 	/* Clear TX 'suspended' bit. */
4126 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4127 
4128 	if (hn_xpnt_vf_isready(sc)) {
4129 		/* Initialize transparent VF. */
4130 		hn_xpnt_vf_init(sc);
4131 	}
4132 
4133 	/* Everything is ready; unleash! */
4134 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4135 
4136 	/* Re-enable polling if requested. */
4137 	if (sc->hn_pollhz > 0)
4138 		hn_polling(sc, sc->hn_pollhz);
4139 }
4140 
4141 static void
4142 hn_init(void *xsc)
4143 {
4144 	struct hn_softc *sc = xsc;
4145 
4146 	HN_LOCK(sc);
4147 	hn_init_locked(sc);
4148 	HN_UNLOCK(sc);
4149 }
4150 
4151 static int
4152 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4153 {
4154 	struct hn_softc *sc = arg1;
4155 	unsigned int lenlim;
4156 	int error;
4157 
4158 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4159 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4160 	if (error || req->newptr == NULL)
4161 		return error;
4162 
4163 	HN_LOCK(sc);
4164 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4165 	    lenlim > TCP_LRO_LENGTH_MAX) {
4166 		HN_UNLOCK(sc);
4167 		return EINVAL;
4168 	}
4169 	hn_set_lro_lenlim(sc, lenlim);
4170 	HN_UNLOCK(sc);
4171 
4172 	return 0;
4173 }
4174 
4175 static int
4176 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4177 {
4178 	struct hn_softc *sc = arg1;
4179 	int ackcnt, error, i;
4180 
4181 	/*
4182 	 * lro_ackcnt_lim is append count limit,
4183 	 * +1 to turn it into aggregation limit.
4184 	 */
4185 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4186 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4187 	if (error || req->newptr == NULL)
4188 		return error;
4189 
4190 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4191 		return EINVAL;
4192 
4193 	/*
4194 	 * Convert aggregation limit back to append
4195 	 * count limit.
4196 	 */
4197 	--ackcnt;
4198 	HN_LOCK(sc);
4199 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4200 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4201 	HN_UNLOCK(sc);
4202 	return 0;
4203 }
4204 
4205 static int
4206 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4207 {
4208 	struct hn_softc *sc = arg1;
4209 	int hcsum = arg2;
4210 	int on, error, i;
4211 
4212 	on = 0;
4213 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4214 		on = 1;
4215 
4216 	error = sysctl_handle_int(oidp, &on, 0, req);
4217 	if (error || req->newptr == NULL)
4218 		return error;
4219 
4220 	HN_LOCK(sc);
4221 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4222 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4223 
4224 		if (on)
4225 			rxr->hn_trust_hcsum |= hcsum;
4226 		else
4227 			rxr->hn_trust_hcsum &= ~hcsum;
4228 	}
4229 	HN_UNLOCK(sc);
4230 	return 0;
4231 }
4232 
4233 static int
4234 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4235 {
4236 	struct hn_softc *sc = arg1;
4237 	int chim_size, error;
4238 
4239 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4240 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4241 	if (error || req->newptr == NULL)
4242 		return error;
4243 
4244 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4245 		return EINVAL;
4246 
4247 	HN_LOCK(sc);
4248 	hn_set_chim_size(sc, chim_size);
4249 	HN_UNLOCK(sc);
4250 	return 0;
4251 }
4252 
4253 static int
4254 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4255 {
4256 	struct hn_softc *sc = arg1;
4257 	int ofs = arg2, i, error;
4258 	struct hn_rx_ring *rxr;
4259 	uint64_t stat;
4260 
4261 	stat = 0;
4262 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4263 		rxr = &sc->hn_rx_ring[i];
4264 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4265 	}
4266 
4267 	error = sysctl_handle_64(oidp, &stat, 0, req);
4268 	if (error || req->newptr == NULL)
4269 		return error;
4270 
4271 	/* Zero out this stat. */
4272 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4273 		rxr = &sc->hn_rx_ring[i];
4274 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4275 	}
4276 	return 0;
4277 }
4278 
4279 static int
4280 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4281 {
4282 	struct hn_softc *sc = arg1;
4283 	int ofs = arg2, i, error;
4284 	struct hn_rx_ring *rxr;
4285 	u_long stat;
4286 
4287 	stat = 0;
4288 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4289 		rxr = &sc->hn_rx_ring[i];
4290 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4291 	}
4292 
4293 	error = sysctl_handle_long(oidp, &stat, 0, req);
4294 	if (error || req->newptr == NULL)
4295 		return error;
4296 
4297 	/* Zero out this stat. */
4298 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4299 		rxr = &sc->hn_rx_ring[i];
4300 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4301 	}
4302 	return 0;
4303 }
4304 
4305 static int
4306 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4307 {
4308 	struct hn_softc *sc = arg1;
4309 	int ofs = arg2, i, error;
4310 	struct hn_tx_ring *txr;
4311 	u_long stat;
4312 
4313 	stat = 0;
4314 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4315 		txr = &sc->hn_tx_ring[i];
4316 		stat += *((u_long *)((uint8_t *)txr + ofs));
4317 	}
4318 
4319 	error = sysctl_handle_long(oidp, &stat, 0, req);
4320 	if (error || req->newptr == NULL)
4321 		return error;
4322 
4323 	/* Zero out this stat. */
4324 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4325 		txr = &sc->hn_tx_ring[i];
4326 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4327 	}
4328 	return 0;
4329 }
4330 
4331 static int
4332 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4333 {
4334 	struct hn_softc *sc = arg1;
4335 	int ofs = arg2, i, error, conf;
4336 	struct hn_tx_ring *txr;
4337 
4338 	txr = &sc->hn_tx_ring[0];
4339 	conf = *((int *)((uint8_t *)txr + ofs));
4340 
4341 	error = sysctl_handle_int(oidp, &conf, 0, req);
4342 	if (error || req->newptr == NULL)
4343 		return error;
4344 
4345 	HN_LOCK(sc);
4346 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4347 		txr = &sc->hn_tx_ring[i];
4348 		*((int *)((uint8_t *)txr + ofs)) = conf;
4349 	}
4350 	HN_UNLOCK(sc);
4351 
4352 	return 0;
4353 }
4354 
4355 static int
4356 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4357 {
4358 	struct hn_softc *sc = arg1;
4359 	int error, size;
4360 
4361 	size = sc->hn_agg_size;
4362 	error = sysctl_handle_int(oidp, &size, 0, req);
4363 	if (error || req->newptr == NULL)
4364 		return (error);
4365 
4366 	HN_LOCK(sc);
4367 	sc->hn_agg_size = size;
4368 	hn_set_txagg(sc);
4369 	HN_UNLOCK(sc);
4370 
4371 	return (0);
4372 }
4373 
4374 static int
4375 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4376 {
4377 	struct hn_softc *sc = arg1;
4378 	int error, pkts;
4379 
4380 	pkts = sc->hn_agg_pkts;
4381 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4382 	if (error || req->newptr == NULL)
4383 		return (error);
4384 
4385 	HN_LOCK(sc);
4386 	sc->hn_agg_pkts = pkts;
4387 	hn_set_txagg(sc);
4388 	HN_UNLOCK(sc);
4389 
4390 	return (0);
4391 }
4392 
4393 static int
4394 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4395 {
4396 	struct hn_softc *sc = arg1;
4397 	int pkts;
4398 
4399 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4400 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4401 }
4402 
4403 static int
4404 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4405 {
4406 	struct hn_softc *sc = arg1;
4407 	int align;
4408 
4409 	align = sc->hn_tx_ring[0].hn_agg_align;
4410 	return (sysctl_handle_int(oidp, &align, 0, req));
4411 }
4412 
4413 static void
4414 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4415 {
4416 	if (pollhz == 0)
4417 		vmbus_chan_poll_disable(chan);
4418 	else
4419 		vmbus_chan_poll_enable(chan, pollhz);
4420 }
4421 
4422 static void
4423 hn_polling(struct hn_softc *sc, u_int pollhz)
4424 {
4425 	int nsubch = sc->hn_rx_ring_inuse - 1;
4426 
4427 	HN_LOCK_ASSERT(sc);
4428 
4429 	if (nsubch > 0) {
4430 		struct vmbus_channel **subch;
4431 		int i;
4432 
4433 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4434 		for (i = 0; i < nsubch; ++i)
4435 			hn_chan_polling(subch[i], pollhz);
4436 		vmbus_subchan_rel(subch, nsubch);
4437 	}
4438 	hn_chan_polling(sc->hn_prichan, pollhz);
4439 }
4440 
4441 static int
4442 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4443 {
4444 	struct hn_softc *sc = arg1;
4445 	int pollhz, error;
4446 
4447 	pollhz = sc->hn_pollhz;
4448 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4449 	if (error || req->newptr == NULL)
4450 		return (error);
4451 
4452 	if (pollhz != 0 &&
4453 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4454 		return (EINVAL);
4455 
4456 	HN_LOCK(sc);
4457 	if (sc->hn_pollhz != pollhz) {
4458 		sc->hn_pollhz = pollhz;
4459 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4460 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4461 			hn_polling(sc, sc->hn_pollhz);
4462 	}
4463 	HN_UNLOCK(sc);
4464 
4465 	return (0);
4466 }
4467 
4468 static int
4469 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4470 {
4471 	struct hn_softc *sc = arg1;
4472 	char verstr[16];
4473 
4474 	snprintf(verstr, sizeof(verstr), "%u.%u",
4475 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4476 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4477 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4478 }
4479 
4480 static int
4481 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4482 {
4483 	struct hn_softc *sc = arg1;
4484 	char caps_str[128];
4485 	uint32_t caps;
4486 
4487 	HN_LOCK(sc);
4488 	caps = sc->hn_caps;
4489 	HN_UNLOCK(sc);
4490 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4491 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4492 }
4493 
4494 static int
4495 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4496 {
4497 	struct hn_softc *sc = arg1;
4498 	char assist_str[128];
4499 	uint32_t hwassist;
4500 
4501 	HN_LOCK(sc);
4502 	hwassist = sc->hn_ifp->if_hwassist;
4503 	HN_UNLOCK(sc);
4504 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4505 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4506 }
4507 
4508 static int
4509 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4510 {
4511 	struct hn_softc *sc = arg1;
4512 	char filter_str[128];
4513 	uint32_t filter;
4514 
4515 	HN_LOCK(sc);
4516 	filter = sc->hn_rx_filter;
4517 	HN_UNLOCK(sc);
4518 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4519 	    NDIS_PACKET_TYPES);
4520 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4521 }
4522 
4523 static int
4524 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4525 {
4526 	struct hn_softc *sc = arg1;
4527 	uint32_t mtu;
4528 	int error;
4529 	HN_LOCK(sc);
4530 	error = hn_rndis_get_mtu(sc, &mtu);
4531 	if (error) {
4532 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4533 		goto back;
4534 	}
4535 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4536 	if (error || req->newptr == NULL)
4537 		goto back;
4538 
4539 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4540 	if (error)
4541 		goto back;
4542 	error = hn_rndis_reconf_offload(sc, mtu);
4543 back:
4544 	HN_UNLOCK(sc);
4545 	return (error);
4546 }
4547 #ifndef RSS
4548 
4549 static int
4550 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4551 {
4552 	struct hn_softc *sc = arg1;
4553 	int error;
4554 
4555 	HN_LOCK(sc);
4556 
4557 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4558 	if (error || req->newptr == NULL)
4559 		goto back;
4560 
4561 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4562 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4563 		/*
4564 		 * RSS key is synchronized w/ VF's, don't allow users
4565 		 * to change it.
4566 		 */
4567 		error = EBUSY;
4568 		goto back;
4569 	}
4570 
4571 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4572 	if (error)
4573 		goto back;
4574 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4575 
4576 	if (sc->hn_rx_ring_inuse > 1) {
4577 		error = hn_rss_reconfig(sc);
4578 	} else {
4579 		/* Not RSS capable, at least for now; just save the RSS key. */
4580 		error = 0;
4581 	}
4582 back:
4583 	HN_UNLOCK(sc);
4584 	return (error);
4585 }
4586 
4587 static int
4588 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4589 {
4590 	struct hn_softc *sc = arg1;
4591 	int error;
4592 
4593 	HN_LOCK(sc);
4594 
4595 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4596 	if (error || req->newptr == NULL)
4597 		goto back;
4598 
4599 	/*
4600 	 * Don't allow RSS indirect table change, if this interface is not
4601 	 * RSS capable currently.
4602 	 */
4603 	if (sc->hn_rx_ring_inuse == 1) {
4604 		error = EOPNOTSUPP;
4605 		goto back;
4606 	}
4607 
4608 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4609 	if (error)
4610 		goto back;
4611 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4612 
4613 	hn_rss_ind_fixup(sc);
4614 	error = hn_rss_reconfig(sc);
4615 back:
4616 	HN_UNLOCK(sc);
4617 	return (error);
4618 }
4619 
4620 #endif	/* !RSS */
4621 
4622 static int
4623 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4624 {
4625 	struct hn_softc *sc = arg1;
4626 	char hash_str[128];
4627 	uint32_t hash;
4628 
4629 	HN_LOCK(sc);
4630 	hash = sc->hn_rss_hash;
4631 	HN_UNLOCK(sc);
4632 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4633 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4634 }
4635 
4636 static int
4637 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4638 {
4639 	struct hn_softc *sc = arg1;
4640 	char hash_str[128];
4641 	uint32_t hash;
4642 
4643 	HN_LOCK(sc);
4644 	hash = sc->hn_rss_hcap;
4645 	HN_UNLOCK(sc);
4646 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4647 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4648 }
4649 
4650 static int
4651 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4652 {
4653 	struct hn_softc *sc = arg1;
4654 	char hash_str[128];
4655 	uint32_t hash;
4656 
4657 	HN_LOCK(sc);
4658 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4659 	HN_UNLOCK(sc);
4660 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4661 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4662 }
4663 
4664 static int
4665 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4666 {
4667 	struct hn_softc *sc = arg1;
4668 	char vf_name[IFNAMSIZ + 1];
4669 	struct ifnet *vf_ifp;
4670 
4671 	HN_LOCK(sc);
4672 	vf_name[0] = '\0';
4673 	vf_ifp = sc->hn_vf_ifp;
4674 	if (vf_ifp != NULL)
4675 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4676 	HN_UNLOCK(sc);
4677 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4678 }
4679 
4680 static int
4681 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4682 {
4683 	struct hn_softc *sc = arg1;
4684 	char vf_name[IFNAMSIZ + 1];
4685 	struct ifnet *vf_ifp;
4686 
4687 	HN_LOCK(sc);
4688 	vf_name[0] = '\0';
4689 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4690 	if (vf_ifp != NULL)
4691 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4692 	HN_UNLOCK(sc);
4693 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4694 }
4695 
4696 static int
4697 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4698 {
4699 	struct rm_priotracker pt;
4700 	struct sbuf *sb;
4701 	int error, i;
4702 	bool first;
4703 
4704 	error = sysctl_wire_old_buffer(req, 0);
4705 	if (error != 0)
4706 		return (error);
4707 
4708 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4709 	if (sb == NULL)
4710 		return (ENOMEM);
4711 
4712 	rm_rlock(&hn_vfmap_lock, &pt);
4713 
4714 	first = true;
4715 	for (i = 0; i < hn_vfmap_size; ++i) {
4716 		struct epoch_tracker et;
4717 		struct ifnet *ifp;
4718 
4719 		if (hn_vfmap[i] == NULL)
4720 			continue;
4721 
4722 		NET_EPOCH_ENTER(et);
4723 		ifp = ifnet_byindex(i);
4724 		if (ifp != NULL) {
4725 			if (first)
4726 				sbuf_printf(sb, "%s", ifp->if_xname);
4727 			else
4728 				sbuf_printf(sb, " %s", ifp->if_xname);
4729 			first = false;
4730 		}
4731 		NET_EPOCH_EXIT(et);
4732 	}
4733 
4734 	rm_runlock(&hn_vfmap_lock, &pt);
4735 
4736 	error = sbuf_finish(sb);
4737 	sbuf_delete(sb);
4738 	return (error);
4739 }
4740 
4741 static int
4742 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4743 {
4744 	struct rm_priotracker pt;
4745 	struct sbuf *sb;
4746 	int error, i;
4747 	bool first;
4748 
4749 	error = sysctl_wire_old_buffer(req, 0);
4750 	if (error != 0)
4751 		return (error);
4752 
4753 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4754 	if (sb == NULL)
4755 		return (ENOMEM);
4756 
4757 	rm_rlock(&hn_vfmap_lock, &pt);
4758 
4759 	first = true;
4760 	for (i = 0; i < hn_vfmap_size; ++i) {
4761 		struct epoch_tracker et;
4762 		struct ifnet *ifp, *hn_ifp;
4763 
4764 		hn_ifp = hn_vfmap[i];
4765 		if (hn_ifp == NULL)
4766 			continue;
4767 
4768 		NET_EPOCH_ENTER(et);
4769 		ifp = ifnet_byindex(i);
4770 		if (ifp != NULL) {
4771 			if (first) {
4772 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4773 				    hn_ifp->if_xname);
4774 			} else {
4775 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4776 				    hn_ifp->if_xname);
4777 			}
4778 			first = false;
4779 		}
4780 		NET_EPOCH_EXIT(et);
4781 	}
4782 
4783 	rm_runlock(&hn_vfmap_lock, &pt);
4784 
4785 	error = sbuf_finish(sb);
4786 	sbuf_delete(sb);
4787 	return (error);
4788 }
4789 
4790 static int
4791 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4792 {
4793 	struct hn_softc *sc = arg1;
4794 	int error, onoff = 0;
4795 
4796 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4797 		onoff = 1;
4798 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4799 	if (error || req->newptr == NULL)
4800 		return (error);
4801 
4802 	HN_LOCK(sc);
4803 	/* NOTE: hn_vf_lock for hn_transmit() */
4804 	rm_wlock(&sc->hn_vf_lock);
4805 	if (onoff)
4806 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4807 	else
4808 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4809 	rm_wunlock(&sc->hn_vf_lock);
4810 	HN_UNLOCK(sc);
4811 
4812 	return (0);
4813 }
4814 
4815 static int
4816 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4817 {
4818 	struct hn_softc *sc = arg1;
4819 	int enabled = 0;
4820 
4821 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4822 		enabled = 1;
4823 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4824 }
4825 
4826 static int
4827 hn_check_iplen(const struct mbuf *m, int hoff)
4828 {
4829 	const struct ip *ip;
4830 	int len, iphlen, iplen;
4831 	const struct tcphdr *th;
4832 	int thoff;				/* TCP data offset */
4833 
4834 	len = hoff + sizeof(struct ip);
4835 
4836 	/* The packet must be at least the size of an IP header. */
4837 	if (m->m_pkthdr.len < len)
4838 		return IPPROTO_DONE;
4839 
4840 	/* The fixed IP header must reside completely in the first mbuf. */
4841 	if (m->m_len < len)
4842 		return IPPROTO_DONE;
4843 
4844 	ip = mtodo(m, hoff);
4845 
4846 	/* Bound check the packet's stated IP header length. */
4847 	iphlen = ip->ip_hl << 2;
4848 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4849 		return IPPROTO_DONE;
4850 
4851 	/* The full IP header must reside completely in the one mbuf. */
4852 	if (m->m_len < hoff + iphlen)
4853 		return IPPROTO_DONE;
4854 
4855 	iplen = ntohs(ip->ip_len);
4856 
4857 	/*
4858 	 * Check that the amount of data in the buffers is as
4859 	 * at least much as the IP header would have us expect.
4860 	 */
4861 	if (m->m_pkthdr.len < hoff + iplen)
4862 		return IPPROTO_DONE;
4863 
4864 	/*
4865 	 * Ignore IP fragments.
4866 	 */
4867 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4868 		return IPPROTO_DONE;
4869 
4870 	/*
4871 	 * The TCP/IP or UDP/IP header must be entirely contained within
4872 	 * the first fragment of a packet.
4873 	 */
4874 	switch (ip->ip_p) {
4875 	case IPPROTO_TCP:
4876 		if (iplen < iphlen + sizeof(struct tcphdr))
4877 			return IPPROTO_DONE;
4878 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4879 			return IPPROTO_DONE;
4880 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4881 		thoff = th->th_off << 2;
4882 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4883 			return IPPROTO_DONE;
4884 		if (m->m_len < hoff + iphlen + thoff)
4885 			return IPPROTO_DONE;
4886 		break;
4887 	case IPPROTO_UDP:
4888 		if (iplen < iphlen + sizeof(struct udphdr))
4889 			return IPPROTO_DONE;
4890 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4891 			return IPPROTO_DONE;
4892 		break;
4893 	default:
4894 		if (iplen < iphlen)
4895 			return IPPROTO_DONE;
4896 		break;
4897 	}
4898 	return ip->ip_p;
4899 }
4900 
4901 static void
4902 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4903 {
4904 	const struct ether_header *eh;
4905 	uint16_t etype;
4906 	int hoff;
4907 
4908 	hoff = sizeof(*eh);
4909 	/* Checked at the beginning of this function. */
4910 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4911 
4912 	eh = mtod(m_new, const struct ether_header *);
4913 	etype = ntohs(eh->ether_type);
4914 	if (etype == ETHERTYPE_VLAN) {
4915 		const struct ether_vlan_header *evl;
4916 
4917 		hoff = sizeof(*evl);
4918 		if (m_new->m_len < hoff)
4919 			return;
4920 		evl = mtod(m_new, const struct ether_vlan_header *);
4921 		etype = ntohs(evl->evl_proto);
4922 	}
4923 	*l3proto = etype;
4924 
4925 	if (etype == ETHERTYPE_IP)
4926 		*l4proto = hn_check_iplen(m_new, hoff);
4927 	else
4928 		*l4proto = IPPROTO_DONE;
4929 }
4930 
4931 static int
4932 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4933 {
4934 	struct sysctl_oid_list *child;
4935 	struct sysctl_ctx_list *ctx;
4936 	device_t dev = sc->hn_dev;
4937 #if defined(INET) || defined(INET6)
4938 	int lroent_cnt;
4939 #endif
4940 	int i;
4941 
4942 	/*
4943 	 * Create RXBUF for reception.
4944 	 *
4945 	 * NOTE:
4946 	 * - It is shared by all channels.
4947 	 * - A large enough buffer is allocated, certain version of NVSes
4948 	 *   may further limit the usable space.
4949 	 */
4950 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4951 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4952 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4953 	if (sc->hn_rxbuf == NULL) {
4954 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4955 		return (ENOMEM);
4956 	}
4957 
4958 	sc->hn_rx_ring_cnt = ring_cnt;
4959 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4960 
4961 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4962 	    M_DEVBUF, M_WAITOK | M_ZERO);
4963 
4964 #if defined(INET) || defined(INET6)
4965 	lroent_cnt = hn_lro_entry_count;
4966 	if (lroent_cnt < TCP_LRO_ENTRIES)
4967 		lroent_cnt = TCP_LRO_ENTRIES;
4968 	if (bootverbose)
4969 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4970 #endif	/* INET || INET6 */
4971 
4972 	ctx = device_get_sysctl_ctx(dev);
4973 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4974 
4975 	/* Create dev.hn.UNIT.rx sysctl tree */
4976 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4977 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4978 
4979 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4980 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4981 
4982 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4983 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4984 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4985 		if (rxr->hn_br == NULL) {
4986 			device_printf(dev, "allocate bufring failed\n");
4987 			return (ENOMEM);
4988 		}
4989 
4990 		if (hn_trust_hosttcp)
4991 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4992 		if (hn_trust_hostudp)
4993 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4994 		if (hn_trust_hostip)
4995 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4996 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4997 		rxr->hn_ifp = sc->hn_ifp;
4998 		if (i < sc->hn_tx_ring_cnt)
4999 			rxr->hn_txr = &sc->hn_tx_ring[i];
5000 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5001 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5002 		rxr->hn_rx_idx = i;
5003 		rxr->hn_rxbuf = sc->hn_rxbuf;
5004 
5005 		/*
5006 		 * Initialize LRO.
5007 		 */
5008 #if defined(INET) || defined(INET6)
5009 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5010 		    hn_lro_mbufq_depth);
5011 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5012 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5013 #endif	/* INET || INET6 */
5014 
5015 		if (sc->hn_rx_sysctl_tree != NULL) {
5016 			char name[16];
5017 
5018 			/*
5019 			 * Create per RX ring sysctl tree:
5020 			 * dev.hn.UNIT.rx.RINGID
5021 			 */
5022 			snprintf(name, sizeof(name), "%d", i);
5023 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5024 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5025 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5026 
5027 			if (rxr->hn_rx_sysctl_tree != NULL) {
5028 				SYSCTL_ADD_ULONG(ctx,
5029 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5030 				    OID_AUTO, "packets",
5031 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5032 				    "# of packets received");
5033 				SYSCTL_ADD_ULONG(ctx,
5034 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5035 				    OID_AUTO, "rss_pkts",
5036 				    CTLFLAG_RW | CTLFLAG_STATS,
5037 				    &rxr->hn_rss_pkts,
5038 				    "# of packets w/ RSS info received");
5039 				SYSCTL_ADD_ULONG(ctx,
5040 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5041 				    OID_AUTO, "rsc_pkts",
5042 				    CTLFLAG_RW | CTLFLAG_STATS,
5043 				    &rxr->hn_rsc_pkts,
5044 				    "# of RSC packets received");
5045 				SYSCTL_ADD_ULONG(ctx,
5046 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5047 				    OID_AUTO, "rsc_drop",
5048 				    CTLFLAG_RW | CTLFLAG_STATS,
5049 				    &rxr->hn_rsc_drop,
5050 				    "# of RSC fragments dropped");
5051 				SYSCTL_ADD_INT(ctx,
5052 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5053 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5054 				    &rxr->hn_pktbuf_len, 0,
5055 				    "Temporary channel packet buffer length");
5056 			}
5057 		}
5058 	}
5059 
5060 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5061 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5062 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5063 	    hn_rx_stat_u64_sysctl,
5064 	    "LU", "LRO queued");
5065 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5066 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5067 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5068 	    hn_rx_stat_u64_sysctl,
5069 	    "LU", "LRO flushed");
5070 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5071 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5072 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5073 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5074 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5075 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5076 	    hn_lro_lenlim_sysctl, "IU",
5077 	    "Max # of data bytes to be aggregated by LRO");
5078 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5079 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5080 	    hn_lro_ackcnt_sysctl, "I",
5081 	    "Max # of ACKs to be aggregated by LRO");
5082 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5083 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5084 	    hn_trust_hcsum_sysctl, "I",
5085 	    "Trust tcp segment verification on host side, "
5086 	    "when csum info is missing");
5087 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5088 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5089 	    hn_trust_hcsum_sysctl, "I",
5090 	    "Trust udp datagram verification on host side, "
5091 	    "when csum info is missing");
5092 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5093 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5094 	    hn_trust_hcsum_sysctl, "I",
5095 	    "Trust ip packet verification on host side, "
5096 	    "when csum info is missing");
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5098 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5099 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5100 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5101 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5102 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5103 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5104 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5105 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5106 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5107 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5108 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5109 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5110 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5111 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5112 	    hn_rx_stat_ulong_sysctl, "LU",
5113 	    "# of packets that we trust host's csum verification");
5114 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5115 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5116 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5117 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5118 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5119 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5120 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5121 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5122 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5123 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5124 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5125 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5126 
5127 	return (0);
5128 }
5129 
5130 static void
5131 hn_destroy_rx_data(struct hn_softc *sc)
5132 {
5133 	int i;
5134 
5135 	if (sc->hn_rxbuf != NULL) {
5136 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5137 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5138 		else
5139 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5140 		sc->hn_rxbuf = NULL;
5141 	}
5142 
5143 	if (sc->hn_rx_ring_cnt == 0)
5144 		return;
5145 
5146 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5147 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5148 
5149 		if (rxr->hn_br == NULL)
5150 			continue;
5151 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5152 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5153 		} else {
5154 			device_printf(sc->hn_dev,
5155 			    "%dth channel bufring is referenced", i);
5156 		}
5157 		rxr->hn_br = NULL;
5158 
5159 #if defined(INET) || defined(INET6)
5160 		tcp_lro_free(&rxr->hn_lro);
5161 #endif
5162 		free(rxr->hn_pktbuf, M_DEVBUF);
5163 	}
5164 	free(sc->hn_rx_ring, M_DEVBUF);
5165 	sc->hn_rx_ring = NULL;
5166 
5167 	sc->hn_rx_ring_cnt = 0;
5168 	sc->hn_rx_ring_inuse = 0;
5169 }
5170 
5171 static int
5172 hn_tx_ring_create(struct hn_softc *sc, int id)
5173 {
5174 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5175 	device_t dev = sc->hn_dev;
5176 	bus_dma_tag_t parent_dtag;
5177 	int error, i;
5178 
5179 	txr->hn_sc = sc;
5180 	txr->hn_tx_idx = id;
5181 
5182 #ifndef HN_USE_TXDESC_BUFRING
5183 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5184 #endif
5185 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5186 
5187 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5188 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5189 	    M_DEVBUF, M_WAITOK | M_ZERO);
5190 #ifndef HN_USE_TXDESC_BUFRING
5191 	SLIST_INIT(&txr->hn_txlist);
5192 #else
5193 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5194 	    M_WAITOK, &txr->hn_tx_lock);
5195 #endif
5196 
5197 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5198 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5199 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5200 	} else {
5201 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5202 	}
5203 
5204 #ifdef HN_IFSTART_SUPPORT
5205 	if (hn_use_if_start) {
5206 		txr->hn_txeof = hn_start_txeof;
5207 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5208 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5209 	} else
5210 #endif
5211 	{
5212 		int br_depth;
5213 
5214 		txr->hn_txeof = hn_xmit_txeof;
5215 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5216 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5217 
5218 		br_depth = hn_get_txswq_depth(txr);
5219 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5220 		    M_WAITOK, &txr->hn_tx_lock);
5221 	}
5222 
5223 	txr->hn_direct_tx_size = hn_direct_tx_size;
5224 
5225 	/*
5226 	 * Always schedule transmission instead of trying to do direct
5227 	 * transmission.  This one gives the best performance so far.
5228 	 */
5229 	txr->hn_sched_tx = 1;
5230 
5231 	parent_dtag = bus_get_dma_tag(dev);
5232 
5233 	/* DMA tag for RNDIS packet messages. */
5234 	error = bus_dma_tag_create(parent_dtag, /* parent */
5235 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5236 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5237 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5238 	    BUS_SPACE_MAXADDR,		/* highaddr */
5239 	    NULL, NULL,			/* filter, filterarg */
5240 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5241 	    1,				/* nsegments */
5242 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5243 	    0,				/* flags */
5244 	    NULL,			/* lockfunc */
5245 	    NULL,			/* lockfuncarg */
5246 	    &txr->hn_tx_rndis_dtag);
5247 	if (error) {
5248 		device_printf(dev, "failed to create rndis dmatag\n");
5249 		return error;
5250 	}
5251 
5252 	/* DMA tag for data. */
5253 	error = bus_dma_tag_create(parent_dtag, /* parent */
5254 	    1,				/* alignment */
5255 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5256 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5257 	    BUS_SPACE_MAXADDR,		/* highaddr */
5258 	    NULL, NULL,			/* filter, filterarg */
5259 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5260 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5261 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5262 	    0,				/* flags */
5263 	    NULL,			/* lockfunc */
5264 	    NULL,			/* lockfuncarg */
5265 	    &txr->hn_tx_data_dtag);
5266 	if (error) {
5267 		device_printf(dev, "failed to create data dmatag\n");
5268 		return error;
5269 	}
5270 
5271 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5272 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5273 
5274 		txd->txr = txr;
5275 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5276 		STAILQ_INIT(&txd->agg_list);
5277 
5278 		/*
5279 		 * Allocate and load RNDIS packet message.
5280 		 */
5281         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5282 		    (void **)&txd->rndis_pkt,
5283 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5284 		    &txd->rndis_pkt_dmap);
5285 		if (error) {
5286 			device_printf(dev,
5287 			    "failed to allocate rndis_packet_msg, %d\n", i);
5288 			return error;
5289 		}
5290 
5291 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5292 		    txd->rndis_pkt_dmap,
5293 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5294 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5295 		    BUS_DMA_NOWAIT);
5296 		if (error) {
5297 			device_printf(dev,
5298 			    "failed to load rndis_packet_msg, %d\n", i);
5299 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5300 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5301 			return error;
5302 		}
5303 
5304 		/* DMA map for TX data. */
5305 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5306 		    &txd->data_dmap);
5307 		if (error) {
5308 			device_printf(dev,
5309 			    "failed to allocate tx data dmamap\n");
5310 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5311 			    txd->rndis_pkt_dmap);
5312 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5313 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5314 			return error;
5315 		}
5316 
5317 		/* All set, put it to list */
5318 		txd->flags |= HN_TXD_FLAG_ONLIST;
5319 #ifndef HN_USE_TXDESC_BUFRING
5320 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5321 #else
5322 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5323 #endif
5324 	}
5325 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5326 
5327 	if (sc->hn_tx_sysctl_tree != NULL) {
5328 		struct sysctl_oid_list *child;
5329 		struct sysctl_ctx_list *ctx;
5330 		char name[16];
5331 
5332 		/*
5333 		 * Create per TX ring sysctl tree:
5334 		 * dev.hn.UNIT.tx.RINGID
5335 		 */
5336 		ctx = device_get_sysctl_ctx(dev);
5337 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5338 
5339 		snprintf(name, sizeof(name), "%d", id);
5340 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5341 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5342 
5343 		if (txr->hn_tx_sysctl_tree != NULL) {
5344 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5345 
5346 #ifdef HN_DEBUG
5347 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5348 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5349 			    "# of available TX descs");
5350 #endif
5351 #ifdef HN_IFSTART_SUPPORT
5352 			if (!hn_use_if_start)
5353 #endif
5354 			{
5355 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5356 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5357 				    "over active");
5358 			}
5359 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5360 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5361 			    "# of packets transmitted");
5362 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5363 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5364 			    "# of sends");
5365 		}
5366 	}
5367 
5368 	return 0;
5369 }
5370 
5371 static void
5372 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5373 {
5374 	struct hn_tx_ring *txr = txd->txr;
5375 
5376 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5377 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5378 
5379 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5380 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5381 	    txd->rndis_pkt_dmap);
5382 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5383 }
5384 
5385 static void
5386 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5387 {
5388 
5389 	KASSERT(txd->refs == 0 || txd->refs == 1,
5390 	    ("invalid txd refs %d", txd->refs));
5391 
5392 	/* Aggregated txds will be freed by their aggregating txd. */
5393 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5394 		int freed __diagused;
5395 
5396 		freed = hn_txdesc_put(txr, txd);
5397 		KASSERT(freed, ("can't free txdesc"));
5398 	}
5399 }
5400 
5401 static void
5402 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5403 {
5404 	int i;
5405 
5406 	if (txr->hn_txdesc == NULL)
5407 		return;
5408 
5409 	/*
5410 	 * NOTE:
5411 	 * Because the freeing of aggregated txds will be deferred
5412 	 * to the aggregating txd, two passes are used here:
5413 	 * - The first pass GCes any pending txds.  This GC is necessary,
5414 	 *   since if the channels are revoked, hypervisor will not
5415 	 *   deliver send-done for all pending txds.
5416 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5417 	 *   were freed.
5418 	 */
5419 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5420 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5421 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5422 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5423 
5424 	if (txr->hn_tx_data_dtag != NULL)
5425 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5426 	if (txr->hn_tx_rndis_dtag != NULL)
5427 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5428 
5429 #ifdef HN_USE_TXDESC_BUFRING
5430 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5431 #endif
5432 
5433 	free(txr->hn_txdesc, M_DEVBUF);
5434 	txr->hn_txdesc = NULL;
5435 
5436 	if (txr->hn_mbuf_br != NULL)
5437 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5438 
5439 #ifndef HN_USE_TXDESC_BUFRING
5440 	mtx_destroy(&txr->hn_txlist_spin);
5441 #endif
5442 	mtx_destroy(&txr->hn_tx_lock);
5443 }
5444 
5445 static int
5446 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5447 {
5448 	struct sysctl_oid_list *child;
5449 	struct sysctl_ctx_list *ctx;
5450 	int i;
5451 
5452 	/*
5453 	 * Create TXBUF for chimney sending.
5454 	 *
5455 	 * NOTE: It is shared by all channels.
5456 	 */
5457 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5458 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5459 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5460 	if (sc->hn_chim == NULL) {
5461 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5462 		return (ENOMEM);
5463 	}
5464 
5465 	sc->hn_tx_ring_cnt = ring_cnt;
5466 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5467 
5468 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5469 	    M_DEVBUF, M_WAITOK | M_ZERO);
5470 
5471 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5472 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5473 
5474 	/* Create dev.hn.UNIT.tx sysctl tree */
5475 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5476 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5477 
5478 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5479 		int error;
5480 
5481 		error = hn_tx_ring_create(sc, i);
5482 		if (error)
5483 			return error;
5484 	}
5485 
5486 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5487 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5488 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5489 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5490 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5491 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5492 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5493 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5495 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5496 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5497 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5498 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5499 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5500 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5501 	    hn_tx_stat_ulong_sysctl, "LU",
5502 	    "# of packet transmission aggregation flush failure");
5503 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5504 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5505 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5506 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5507 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5508 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5509 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5510 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5511 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5512 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5513 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5514 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5515 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5516 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5517 	    "# of total TX descs");
5518 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5519 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5520 	    "Chimney send packet size upper boundary");
5521 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5522 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5523 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5524 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5525 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5526 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5527 	    hn_tx_conf_int_sysctl, "I",
5528 	    "Size of the packet for direct transmission");
5529 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5530 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5531 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5532 	    hn_tx_conf_int_sysctl, "I",
5533 	    "Always schedule transmission "
5534 	    "instead of doing direct transmission");
5535 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5536 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5537 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5538 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5539 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5540 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5541 	    "Applied packet transmission aggregation size");
5542 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5543 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5544 	    hn_txagg_pktmax_sysctl, "I",
5545 	    "Applied packet transmission aggregation packets");
5546 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5547 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5548 	    hn_txagg_align_sysctl, "I",
5549 	    "Applied packet transmission aggregation alignment");
5550 
5551 	return 0;
5552 }
5553 
5554 static void
5555 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5556 {
5557 	int i;
5558 
5559 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5560 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5561 }
5562 
5563 static void
5564 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5565 {
5566 	struct ifnet *ifp = sc->hn_ifp;
5567 	u_int hw_tsomax;
5568 	int tso_minlen;
5569 
5570 	HN_LOCK_ASSERT(sc);
5571 
5572 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5573 		return;
5574 
5575 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5576 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5577 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5578 
5579 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5580 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5581 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5582 
5583 	if (tso_maxlen < tso_minlen)
5584 		tso_maxlen = tso_minlen;
5585 	else if (tso_maxlen > IP_MAXPACKET)
5586 		tso_maxlen = IP_MAXPACKET;
5587 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5588 		tso_maxlen = sc->hn_ndis_tso_szmax;
5589 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5590 
5591 	if (hn_xpnt_vf_isready(sc)) {
5592 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5593 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5594 	}
5595 	ifp->if_hw_tsomax = hw_tsomax;
5596 	if (bootverbose)
5597 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5598 }
5599 
5600 static void
5601 hn_fixup_tx_data(struct hn_softc *sc)
5602 {
5603 	uint64_t csum_assist;
5604 	int i;
5605 
5606 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5607 	if (hn_tx_chimney_size > 0 &&
5608 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5609 		hn_set_chim_size(sc, hn_tx_chimney_size);
5610 
5611 	csum_assist = 0;
5612 	if (sc->hn_caps & HN_CAP_IPCS)
5613 		csum_assist |= CSUM_IP;
5614 	if (sc->hn_caps & HN_CAP_TCP4CS)
5615 		csum_assist |= CSUM_IP_TCP;
5616 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5617 		csum_assist |= CSUM_IP_UDP;
5618 	if (sc->hn_caps & HN_CAP_TCP6CS)
5619 		csum_assist |= CSUM_IP6_TCP;
5620 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5621 		csum_assist |= CSUM_IP6_UDP;
5622 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5623 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5624 
5625 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5626 		/*
5627 		 * Support HASHVAL pktinfo on TX path.
5628 		 */
5629 		if (bootverbose)
5630 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5631 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5632 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5633 	}
5634 }
5635 
5636 static void
5637 hn_fixup_rx_data(struct hn_softc *sc)
5638 {
5639 
5640 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5641 		int i;
5642 
5643 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5644 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5645 	}
5646 }
5647 
5648 static void
5649 hn_destroy_tx_data(struct hn_softc *sc)
5650 {
5651 	int i;
5652 
5653 	if (sc->hn_chim != NULL) {
5654 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5655 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5656 		} else {
5657 			device_printf(sc->hn_dev,
5658 			    "chimney sending buffer is referenced");
5659 		}
5660 		sc->hn_chim = NULL;
5661 	}
5662 
5663 	if (sc->hn_tx_ring_cnt == 0)
5664 		return;
5665 
5666 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5667 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5668 
5669 	free(sc->hn_tx_ring, M_DEVBUF);
5670 	sc->hn_tx_ring = NULL;
5671 
5672 	sc->hn_tx_ring_cnt = 0;
5673 	sc->hn_tx_ring_inuse = 0;
5674 }
5675 
5676 #ifdef HN_IFSTART_SUPPORT
5677 
5678 static void
5679 hn_start_taskfunc(void *xtxr, int pending __unused)
5680 {
5681 	struct hn_tx_ring *txr = xtxr;
5682 
5683 	mtx_lock(&txr->hn_tx_lock);
5684 	hn_start_locked(txr, 0);
5685 	mtx_unlock(&txr->hn_tx_lock);
5686 }
5687 
5688 static int
5689 hn_start_locked(struct hn_tx_ring *txr, int len)
5690 {
5691 	struct hn_softc *sc = txr->hn_sc;
5692 	struct ifnet *ifp = sc->hn_ifp;
5693 	int sched = 0;
5694 
5695 	KASSERT(hn_use_if_start,
5696 	    ("hn_start_locked is called, when if_start is disabled"));
5697 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5698 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5699 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5700 
5701 	if (__predict_false(txr->hn_suspended))
5702 		return (0);
5703 
5704 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5705 	    IFF_DRV_RUNNING)
5706 		return (0);
5707 
5708 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5709 		struct hn_txdesc *txd;
5710 		struct mbuf *m_head;
5711 		int error;
5712 
5713 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5714 		if (m_head == NULL)
5715 			break;
5716 
5717 		if (len > 0 && m_head->m_pkthdr.len > len) {
5718 			/*
5719 			 * This sending could be time consuming; let callers
5720 			 * dispatch this packet sending (and sending of any
5721 			 * following up packets) to tx taskqueue.
5722 			 */
5723 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5724 			sched = 1;
5725 			break;
5726 		}
5727 
5728 #if defined(INET6) || defined(INET)
5729 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5730 			m_head = hn_tso_fixup(m_head);
5731 			if (__predict_false(m_head == NULL)) {
5732 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5733 				continue;
5734 			}
5735 		} else if (m_head->m_pkthdr.csum_flags &
5736 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5737 			m_head = hn_set_hlen(m_head);
5738 			if (__predict_false(m_head == NULL)) {
5739 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5740 				continue;
5741 			}
5742 		}
5743 #endif
5744 
5745 		txd = hn_txdesc_get(txr);
5746 		if (txd == NULL) {
5747 			txr->hn_no_txdescs++;
5748 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5749 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5750 			break;
5751 		}
5752 
5753 		error = hn_encap(ifp, txr, txd, &m_head);
5754 		if (error) {
5755 			/* Both txd and m_head are freed */
5756 			KASSERT(txr->hn_agg_txd == NULL,
5757 			    ("encap failed w/ pending aggregating txdesc"));
5758 			continue;
5759 		}
5760 
5761 		if (txr->hn_agg_pktleft == 0) {
5762 			if (txr->hn_agg_txd != NULL) {
5763 				KASSERT(m_head == NULL,
5764 				    ("pending mbuf for aggregating txdesc"));
5765 				error = hn_flush_txagg(ifp, txr);
5766 				if (__predict_false(error)) {
5767 					atomic_set_int(&ifp->if_drv_flags,
5768 					    IFF_DRV_OACTIVE);
5769 					break;
5770 				}
5771 			} else {
5772 				KASSERT(m_head != NULL, ("mbuf was freed"));
5773 				error = hn_txpkt(ifp, txr, txd);
5774 				if (__predict_false(error)) {
5775 					/* txd is freed, but m_head is not */
5776 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5777 					atomic_set_int(&ifp->if_drv_flags,
5778 					    IFF_DRV_OACTIVE);
5779 					break;
5780 				}
5781 			}
5782 		}
5783 #ifdef INVARIANTS
5784 		else {
5785 			KASSERT(txr->hn_agg_txd != NULL,
5786 			    ("no aggregating txdesc"));
5787 			KASSERT(m_head == NULL,
5788 			    ("pending mbuf for aggregating txdesc"));
5789 		}
5790 #endif
5791 	}
5792 
5793 	/* Flush pending aggerated transmission. */
5794 	if (txr->hn_agg_txd != NULL)
5795 		hn_flush_txagg(ifp, txr);
5796 	return (sched);
5797 }
5798 
5799 static void
5800 hn_start(struct ifnet *ifp)
5801 {
5802 	struct hn_softc *sc = ifp->if_softc;
5803 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5804 
5805 	if (txr->hn_sched_tx)
5806 		goto do_sched;
5807 
5808 	if (mtx_trylock(&txr->hn_tx_lock)) {
5809 		int sched;
5810 
5811 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5812 		mtx_unlock(&txr->hn_tx_lock);
5813 		if (!sched)
5814 			return;
5815 	}
5816 do_sched:
5817 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5818 }
5819 
5820 static void
5821 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5822 {
5823 	struct hn_tx_ring *txr = xtxr;
5824 
5825 	mtx_lock(&txr->hn_tx_lock);
5826 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5827 	hn_start_locked(txr, 0);
5828 	mtx_unlock(&txr->hn_tx_lock);
5829 }
5830 
5831 static void
5832 hn_start_txeof(struct hn_tx_ring *txr)
5833 {
5834 	struct hn_softc *sc = txr->hn_sc;
5835 	struct ifnet *ifp = sc->hn_ifp;
5836 
5837 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5838 
5839 	if (txr->hn_sched_tx)
5840 		goto do_sched;
5841 
5842 	if (mtx_trylock(&txr->hn_tx_lock)) {
5843 		int sched;
5844 
5845 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5846 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5847 		mtx_unlock(&txr->hn_tx_lock);
5848 		if (sched) {
5849 			taskqueue_enqueue(txr->hn_tx_taskq,
5850 			    &txr->hn_tx_task);
5851 		}
5852 	} else {
5853 do_sched:
5854 		/*
5855 		 * Release the OACTIVE earlier, with the hope, that
5856 		 * others could catch up.  The task will clear the
5857 		 * flag again with the hn_tx_lock to avoid possible
5858 		 * races.
5859 		 */
5860 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5861 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5862 	}
5863 }
5864 
5865 #endif	/* HN_IFSTART_SUPPORT */
5866 
5867 static int
5868 hn_xmit(struct hn_tx_ring *txr, int len)
5869 {
5870 	struct hn_softc *sc = txr->hn_sc;
5871 	struct ifnet *ifp = sc->hn_ifp;
5872 	struct mbuf *m_head;
5873 	int sched = 0;
5874 
5875 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5876 #ifdef HN_IFSTART_SUPPORT
5877 	KASSERT(hn_use_if_start == 0,
5878 	    ("hn_xmit is called, when if_start is enabled"));
5879 #endif
5880 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5881 
5882 	if (__predict_false(txr->hn_suspended))
5883 		return (0);
5884 
5885 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5886 		return (0);
5887 
5888 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5889 		struct hn_txdesc *txd;
5890 		int error;
5891 
5892 		if (len > 0 && m_head->m_pkthdr.len > len) {
5893 			/*
5894 			 * This sending could be time consuming; let callers
5895 			 * dispatch this packet sending (and sending of any
5896 			 * following up packets) to tx taskqueue.
5897 			 */
5898 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5899 			sched = 1;
5900 			break;
5901 		}
5902 
5903 		txd = hn_txdesc_get(txr);
5904 		if (txd == NULL) {
5905 			txr->hn_no_txdescs++;
5906 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5907 			txr->hn_oactive = 1;
5908 			break;
5909 		}
5910 
5911 		error = hn_encap(ifp, txr, txd, &m_head);
5912 		if (error) {
5913 			/* Both txd and m_head are freed; discard */
5914 			KASSERT(txr->hn_agg_txd == NULL,
5915 			    ("encap failed w/ pending aggregating txdesc"));
5916 			drbr_advance(ifp, txr->hn_mbuf_br);
5917 			continue;
5918 		}
5919 
5920 		if (txr->hn_agg_pktleft == 0) {
5921 			if (txr->hn_agg_txd != NULL) {
5922 				KASSERT(m_head == NULL,
5923 				    ("pending mbuf for aggregating txdesc"));
5924 				error = hn_flush_txagg(ifp, txr);
5925 				if (__predict_false(error)) {
5926 					txr->hn_oactive = 1;
5927 					break;
5928 				}
5929 			} else {
5930 				KASSERT(m_head != NULL, ("mbuf was freed"));
5931 				error = hn_txpkt(ifp, txr, txd);
5932 				if (__predict_false(error)) {
5933 					/* txd is freed, but m_head is not */
5934 					drbr_putback(ifp, txr->hn_mbuf_br,
5935 					    m_head);
5936 					txr->hn_oactive = 1;
5937 					break;
5938 				}
5939 			}
5940 		}
5941 #ifdef INVARIANTS
5942 		else {
5943 			KASSERT(txr->hn_agg_txd != NULL,
5944 			    ("no aggregating txdesc"));
5945 			KASSERT(m_head == NULL,
5946 			    ("pending mbuf for aggregating txdesc"));
5947 		}
5948 #endif
5949 
5950 		/* Sent */
5951 		drbr_advance(ifp, txr->hn_mbuf_br);
5952 	}
5953 
5954 	/* Flush pending aggerated transmission. */
5955 	if (txr->hn_agg_txd != NULL)
5956 		hn_flush_txagg(ifp, txr);
5957 	return (sched);
5958 }
5959 
5960 static int
5961 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5962 {
5963 	struct hn_softc *sc = ifp->if_softc;
5964 	struct hn_tx_ring *txr;
5965 	int error, idx = 0;
5966 
5967 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5968 		struct rm_priotracker pt;
5969 
5970 		rm_rlock(&sc->hn_vf_lock, &pt);
5971 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5972 			struct mbuf *m_bpf = NULL;
5973 			int obytes, omcast;
5974 
5975 			obytes = m->m_pkthdr.len;
5976 			omcast = (m->m_flags & M_MCAST) != 0;
5977 
5978 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5979 				if (bpf_peers_present(ifp->if_bpf)) {
5980 					m_bpf = m_copypacket(m, M_NOWAIT);
5981 					if (m_bpf == NULL) {
5982 						/*
5983 						 * Failed to grab a shallow
5984 						 * copy; tap now.
5985 						 */
5986 						ETHER_BPF_MTAP(ifp, m);
5987 					}
5988 				}
5989 			} else {
5990 				ETHER_BPF_MTAP(ifp, m);
5991 			}
5992 
5993 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5994 			rm_runlock(&sc->hn_vf_lock, &pt);
5995 
5996 			if (m_bpf != NULL) {
5997 				if (!error)
5998 					ETHER_BPF_MTAP(ifp, m_bpf);
5999 				m_freem(m_bpf);
6000 			}
6001 
6002 			if (error == ENOBUFS) {
6003 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6004 			} else if (error) {
6005 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6006 			} else {
6007 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6008 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6009 				if (omcast) {
6010 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6011 					    omcast);
6012 				}
6013 			}
6014 			return (error);
6015 		}
6016 		rm_runlock(&sc->hn_vf_lock, &pt);
6017 	}
6018 
6019 #if defined(INET6) || defined(INET)
6020 	/*
6021 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6022 	 * since packet headers should be cache-hot.
6023 	 */
6024 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6025 		m = hn_tso_fixup(m);
6026 		if (__predict_false(m == NULL)) {
6027 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6028 			return EIO;
6029 		}
6030 	} else if (m->m_pkthdr.csum_flags &
6031 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6032 		m = hn_set_hlen(m);
6033 		if (__predict_false(m == NULL)) {
6034 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6035 			return EIO;
6036 		}
6037 	}
6038 #endif
6039 
6040 	/*
6041 	 * Select the TX ring based on flowid
6042 	 */
6043 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6044 #ifdef RSS
6045 		uint32_t bid;
6046 
6047 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6048 		    &bid) == 0)
6049 			idx = bid % sc->hn_tx_ring_inuse;
6050 		else
6051 #endif
6052 		{
6053 #if defined(INET6) || defined(INET)
6054 			int tcpsyn = 0;
6055 
6056 			if (m->m_pkthdr.len < 128 &&
6057 			    (m->m_pkthdr.csum_flags &
6058 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6059 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6060 				m = hn_check_tcpsyn(m, &tcpsyn);
6061 				if (__predict_false(m == NULL)) {
6062 					if_inc_counter(ifp,
6063 					    IFCOUNTER_OERRORS, 1);
6064 					return (EIO);
6065 				}
6066 			}
6067 #else
6068 			const int tcpsyn = 0;
6069 #endif
6070 			if (tcpsyn)
6071 				idx = 0;
6072 			else
6073 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6074 		}
6075 	}
6076 	txr = &sc->hn_tx_ring[idx];
6077 
6078 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6079 	if (error) {
6080 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6081 		return error;
6082 	}
6083 
6084 	if (txr->hn_oactive)
6085 		return 0;
6086 
6087 	if (txr->hn_sched_tx)
6088 		goto do_sched;
6089 
6090 	if (mtx_trylock(&txr->hn_tx_lock)) {
6091 		int sched;
6092 
6093 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6094 		mtx_unlock(&txr->hn_tx_lock);
6095 		if (!sched)
6096 			return 0;
6097 	}
6098 do_sched:
6099 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6100 	return 0;
6101 }
6102 
6103 static void
6104 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6105 {
6106 	struct mbuf *m;
6107 
6108 	mtx_lock(&txr->hn_tx_lock);
6109 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6110 		m_freem(m);
6111 	mtx_unlock(&txr->hn_tx_lock);
6112 }
6113 
6114 static void
6115 hn_xmit_qflush(struct ifnet *ifp)
6116 {
6117 	struct hn_softc *sc = ifp->if_softc;
6118 	struct rm_priotracker pt;
6119 	int i;
6120 
6121 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6122 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6123 	if_qflush(ifp);
6124 
6125 	rm_rlock(&sc->hn_vf_lock, &pt);
6126 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6127 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6128 	rm_runlock(&sc->hn_vf_lock, &pt);
6129 }
6130 
6131 static void
6132 hn_xmit_txeof(struct hn_tx_ring *txr)
6133 {
6134 
6135 	if (txr->hn_sched_tx)
6136 		goto do_sched;
6137 
6138 	if (mtx_trylock(&txr->hn_tx_lock)) {
6139 		int sched;
6140 
6141 		txr->hn_oactive = 0;
6142 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6143 		mtx_unlock(&txr->hn_tx_lock);
6144 		if (sched) {
6145 			taskqueue_enqueue(txr->hn_tx_taskq,
6146 			    &txr->hn_tx_task);
6147 		}
6148 	} else {
6149 do_sched:
6150 		/*
6151 		 * Release the oactive earlier, with the hope, that
6152 		 * others could catch up.  The task will clear the
6153 		 * oactive again with the hn_tx_lock to avoid possible
6154 		 * races.
6155 		 */
6156 		txr->hn_oactive = 0;
6157 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6158 	}
6159 }
6160 
6161 static void
6162 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6163 {
6164 	struct hn_tx_ring *txr = xtxr;
6165 
6166 	mtx_lock(&txr->hn_tx_lock);
6167 	hn_xmit(txr, 0);
6168 	mtx_unlock(&txr->hn_tx_lock);
6169 }
6170 
6171 static void
6172 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6173 {
6174 	struct hn_tx_ring *txr = xtxr;
6175 
6176 	mtx_lock(&txr->hn_tx_lock);
6177 	txr->hn_oactive = 0;
6178 	hn_xmit(txr, 0);
6179 	mtx_unlock(&txr->hn_tx_lock);
6180 }
6181 
6182 static int
6183 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6184 {
6185 	struct vmbus_chan_br cbr;
6186 	struct hn_rx_ring *rxr;
6187 	struct hn_tx_ring *txr = NULL;
6188 	int idx, error;
6189 
6190 	idx = vmbus_chan_subidx(chan);
6191 
6192 	/*
6193 	 * Link this channel to RX/TX ring.
6194 	 */
6195 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6196 	    ("invalid channel index %d, should > 0 && < %d",
6197 	     idx, sc->hn_rx_ring_inuse));
6198 	rxr = &sc->hn_rx_ring[idx];
6199 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6200 	    ("RX ring %d already attached", idx));
6201 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6202 	rxr->hn_chan = chan;
6203 
6204 	if (bootverbose) {
6205 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6206 		    idx, vmbus_chan_id(chan));
6207 	}
6208 
6209 	if (idx < sc->hn_tx_ring_inuse) {
6210 		txr = &sc->hn_tx_ring[idx];
6211 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6212 		    ("TX ring %d already attached", idx));
6213 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6214 
6215 		txr->hn_chan = chan;
6216 		if (bootverbose) {
6217 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6218 			    idx, vmbus_chan_id(chan));
6219 		}
6220 	}
6221 
6222 	/* Bind this channel to a proper CPU. */
6223 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6224 
6225 	/*
6226 	 * Open this channel
6227 	 */
6228 	cbr.cbr = rxr->hn_br;
6229 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6230 	cbr.cbr_txsz = HN_TXBR_SIZE;
6231 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6232 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6233 	if (error) {
6234 		if (error == EISCONN) {
6235 			if_printf(sc->hn_ifp, "bufring is connected after "
6236 			    "chan%u open failure\n", vmbus_chan_id(chan));
6237 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6238 		} else {
6239 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6240 			    vmbus_chan_id(chan), error);
6241 		}
6242 	}
6243 	return (error);
6244 }
6245 
6246 static void
6247 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6248 {
6249 	struct hn_rx_ring *rxr;
6250 	int idx, error;
6251 
6252 	idx = vmbus_chan_subidx(chan);
6253 
6254 	/*
6255 	 * Link this channel to RX/TX ring.
6256 	 */
6257 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6258 	    ("invalid channel index %d, should > 0 && < %d",
6259 	     idx, sc->hn_rx_ring_inuse));
6260 	rxr = &sc->hn_rx_ring[idx];
6261 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6262 	    ("RX ring %d is not attached", idx));
6263 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6264 
6265 	if (idx < sc->hn_tx_ring_inuse) {
6266 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6267 
6268 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6269 		    ("TX ring %d is not attached attached", idx));
6270 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6271 	}
6272 
6273 	/*
6274 	 * Close this channel.
6275 	 *
6276 	 * NOTE:
6277 	 * Channel closing does _not_ destroy the target channel.
6278 	 */
6279 	error = vmbus_chan_close_direct(chan);
6280 	if (error == EISCONN) {
6281 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6282 		    "after being closed\n", vmbus_chan_id(chan));
6283 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6284 	} else if (error) {
6285 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6286 		    vmbus_chan_id(chan), error);
6287 	}
6288 }
6289 
6290 static int
6291 hn_attach_subchans(struct hn_softc *sc)
6292 {
6293 	struct vmbus_channel **subchans;
6294 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6295 	int i, error = 0;
6296 
6297 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6298 
6299 	/* Attach the sub-channels. */
6300 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6301 	for (i = 0; i < subchan_cnt; ++i) {
6302 		int error1;
6303 
6304 		error1 = hn_chan_attach(sc, subchans[i]);
6305 		if (error1) {
6306 			error = error1;
6307 			/* Move on; all channels will be detached later. */
6308 		}
6309 	}
6310 	vmbus_subchan_rel(subchans, subchan_cnt);
6311 
6312 	if (error) {
6313 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6314 	} else {
6315 		if (bootverbose) {
6316 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6317 			    subchan_cnt);
6318 		}
6319 	}
6320 	return (error);
6321 }
6322 
6323 static void
6324 hn_detach_allchans(struct hn_softc *sc)
6325 {
6326 	struct vmbus_channel **subchans;
6327 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6328 	int i;
6329 
6330 	if (subchan_cnt == 0)
6331 		goto back;
6332 
6333 	/* Detach the sub-channels. */
6334 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6335 	for (i = 0; i < subchan_cnt; ++i)
6336 		hn_chan_detach(sc, subchans[i]);
6337 	vmbus_subchan_rel(subchans, subchan_cnt);
6338 
6339 back:
6340 	/*
6341 	 * Detach the primary channel, _after_ all sub-channels
6342 	 * are detached.
6343 	 */
6344 	hn_chan_detach(sc, sc->hn_prichan);
6345 
6346 	/* Wait for sub-channels to be destroyed, if any. */
6347 	vmbus_subchan_drain(sc->hn_prichan);
6348 
6349 #ifdef INVARIANTS
6350 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6351 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6352 		    HN_RX_FLAG_ATTACHED) == 0,
6353 		    ("%dth RX ring is still attached", i));
6354 	}
6355 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6356 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6357 		    HN_TX_FLAG_ATTACHED) == 0,
6358 		    ("%dth TX ring is still attached", i));
6359 	}
6360 #endif
6361 }
6362 
6363 static int
6364 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6365 {
6366 	struct vmbus_channel **subchans;
6367 	int nchan, rxr_cnt, error;
6368 
6369 	nchan = *nsubch + 1;
6370 	if (nchan == 1) {
6371 		/*
6372 		 * Multiple RX/TX rings are not requested.
6373 		 */
6374 		*nsubch = 0;
6375 		return (0);
6376 	}
6377 
6378 	/*
6379 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6380 	 * table entries.
6381 	 */
6382 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6383 	if (error) {
6384 		/* No RSS; this is benign. */
6385 		*nsubch = 0;
6386 		return (0);
6387 	}
6388 	if (bootverbose) {
6389 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6390 		    rxr_cnt, nchan);
6391 	}
6392 
6393 	if (nchan > rxr_cnt)
6394 		nchan = rxr_cnt;
6395 	if (nchan == 1) {
6396 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6397 		*nsubch = 0;
6398 		return (0);
6399 	}
6400 
6401 	/*
6402 	 * Allocate sub-channels from NVS.
6403 	 */
6404 	*nsubch = nchan - 1;
6405 	error = hn_nvs_alloc_subchans(sc, nsubch);
6406 	if (error || *nsubch == 0) {
6407 		/* Failed to allocate sub-channels. */
6408 		*nsubch = 0;
6409 		return (0);
6410 	}
6411 
6412 	/*
6413 	 * Wait for all sub-channels to become ready before moving on.
6414 	 */
6415 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6416 	vmbus_subchan_rel(subchans, *nsubch);
6417 	return (0);
6418 }
6419 
6420 static bool
6421 hn_synth_attachable(const struct hn_softc *sc)
6422 {
6423 	int i;
6424 
6425 	if (sc->hn_flags & HN_FLAG_ERRORS)
6426 		return (false);
6427 
6428 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6429 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6430 
6431 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6432 			return (false);
6433 	}
6434 	return (true);
6435 }
6436 
6437 /*
6438  * Make sure that the RX filter is zero after the successful
6439  * RNDIS initialization.
6440  *
6441  * NOTE:
6442  * Under certain conditions on certain versions of Hyper-V,
6443  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6444  * after the successful RNDIS initialization, which breaks
6445  * the assumption of any following code (well, it breaks the
6446  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6447  * explicitly, drain packets sneaking through, and drain the
6448  * interrupt taskqueues scheduled due to the stealth packets.
6449  */
6450 static void
6451 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6452 {
6453 
6454 	hn_disable_rx(sc);
6455 	hn_drain_rxtx(sc, nchan);
6456 }
6457 
6458 static int
6459 hn_synth_attach(struct hn_softc *sc, int mtu)
6460 {
6461 #define ATTACHED_NVS		0x0002
6462 #define ATTACHED_RNDIS		0x0004
6463 
6464 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6465 	int error, nsubch, nchan = 1, i, rndis_inited;
6466 	uint32_t old_caps, attached = 0;
6467 
6468 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6469 	    ("synthetic parts were attached"));
6470 
6471 	if (!hn_synth_attachable(sc))
6472 		return (ENXIO);
6473 
6474 	/* Save capabilities for later verification. */
6475 	old_caps = sc->hn_caps;
6476 	sc->hn_caps = 0;
6477 
6478 	/* Clear RSS stuffs. */
6479 	sc->hn_rss_ind_size = 0;
6480 	sc->hn_rss_hash = 0;
6481 	sc->hn_rss_hcap = 0;
6482 
6483 	/*
6484 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6485 	 */
6486 	error = hn_chan_attach(sc, sc->hn_prichan);
6487 	if (error)
6488 		goto failed;
6489 
6490 	/*
6491 	 * Attach NVS.
6492 	 */
6493 	error = hn_nvs_attach(sc, mtu);
6494 	if (error)
6495 		goto failed;
6496 	attached |= ATTACHED_NVS;
6497 
6498 	/*
6499 	 * Attach RNDIS _after_ NVS is attached.
6500 	 */
6501 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6502 	if (rndis_inited)
6503 		attached |= ATTACHED_RNDIS;
6504 	if (error)
6505 		goto failed;
6506 
6507 	/*
6508 	 * Make sure capabilities are not changed.
6509 	 */
6510 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6511 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6512 		    old_caps, sc->hn_caps);
6513 		error = ENXIO;
6514 		goto failed;
6515 	}
6516 
6517 	/*
6518 	 * Allocate sub-channels for multi-TX/RX rings.
6519 	 *
6520 	 * NOTE:
6521 	 * The # of RX rings that can be used is equivalent to the # of
6522 	 * channels to be requested.
6523 	 */
6524 	nsubch = sc->hn_rx_ring_cnt - 1;
6525 	error = hn_synth_alloc_subchans(sc, &nsubch);
6526 	if (error)
6527 		goto failed;
6528 	/* NOTE: _Full_ synthetic parts detach is required now. */
6529 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6530 
6531 	/*
6532 	 * Set the # of TX/RX rings that could be used according to
6533 	 * the # of channels that NVS offered.
6534 	 */
6535 	nchan = nsubch + 1;
6536 	hn_set_ring_inuse(sc, nchan);
6537 	if (nchan == 1) {
6538 		/* Only the primary channel can be used; done */
6539 		goto back;
6540 	}
6541 
6542 	/*
6543 	 * Attach the sub-channels.
6544 	 *
6545 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6546 	 */
6547 	error = hn_attach_subchans(sc);
6548 	if (error)
6549 		goto failed;
6550 
6551 	/*
6552 	 * Configure RSS key and indirect table _after_ all sub-channels
6553 	 * are attached.
6554 	 */
6555 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6556 		/*
6557 		 * RSS key is not set yet; set it to the default RSS key.
6558 		 */
6559 		if (bootverbose)
6560 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6561 #ifdef RSS
6562 		rss_getkey(rss->rss_key);
6563 #else
6564 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6565 #endif
6566 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6567 	}
6568 
6569 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6570 		/*
6571 		 * RSS indirect table is not set yet; set it up in round-
6572 		 * robin fashion.
6573 		 */
6574 		if (bootverbose) {
6575 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6576 			    "table\n");
6577 		}
6578 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6579 			uint32_t subidx;
6580 
6581 #ifdef RSS
6582 			subidx = rss_get_indirection_to_bucket(i);
6583 #else
6584 			subidx = i;
6585 #endif
6586 			rss->rss_ind[i] = subidx % nchan;
6587 		}
6588 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6589 	} else {
6590 		/*
6591 		 * # of usable channels may be changed, so we have to
6592 		 * make sure that all entries in RSS indirect table
6593 		 * are valid.
6594 		 *
6595 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6596 		 */
6597 		hn_rss_ind_fixup(sc);
6598 	}
6599 
6600 	sc->hn_rss_hash = sc->hn_rss_hcap;
6601 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6602 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6603 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6604 		hn_vf_rss_fixup(sc, false);
6605 	}
6606 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6607 	if (error)
6608 		goto failed;
6609 back:
6610 	/*
6611 	 * Fixup transmission aggregation setup.
6612 	 */
6613 	hn_set_txagg(sc);
6614 	hn_rndis_init_fixat(sc, nchan);
6615 	return (0);
6616 
6617 failed:
6618 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6619 		hn_rndis_init_fixat(sc, nchan);
6620 		hn_synth_detach(sc);
6621 	} else {
6622 		if (attached & ATTACHED_RNDIS) {
6623 			hn_rndis_init_fixat(sc, nchan);
6624 			hn_rndis_detach(sc);
6625 		}
6626 		if (attached & ATTACHED_NVS)
6627 			hn_nvs_detach(sc);
6628 		hn_chan_detach(sc, sc->hn_prichan);
6629 		/* Restore old capabilities. */
6630 		sc->hn_caps = old_caps;
6631 	}
6632 	return (error);
6633 
6634 #undef ATTACHED_RNDIS
6635 #undef ATTACHED_NVS
6636 }
6637 
6638 /*
6639  * NOTE:
6640  * The interface must have been suspended though hn_suspend(), before
6641  * this function get called.
6642  */
6643 static void
6644 hn_synth_detach(struct hn_softc *sc)
6645 {
6646 
6647 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6648 	    ("synthetic parts were not attached"));
6649 
6650 	/* Detach the RNDIS first. */
6651 	hn_rndis_detach(sc);
6652 
6653 	/* Detach NVS. */
6654 	hn_nvs_detach(sc);
6655 
6656 	/* Detach all of the channels. */
6657 	hn_detach_allchans(sc);
6658 
6659 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6660 		/*
6661 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6662 		 */
6663 		int error;
6664 
6665 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6666 		    sc->hn_rxbuf_gpadl);
6667 		if (error) {
6668 			if_printf(sc->hn_ifp,
6669 			    "rxbuf gpadl disconn failed: %d\n", error);
6670 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6671 		}
6672 		sc->hn_rxbuf_gpadl = 0;
6673 	}
6674 
6675 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6676 		/*
6677 		 * Host is post-Win2016, disconnect chimney sending buffer from
6678 		 * primary channel here.
6679 		 */
6680 		int error;
6681 
6682 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6683 		    sc->hn_chim_gpadl);
6684 		if (error) {
6685 			if_printf(sc->hn_ifp,
6686 			    "chim gpadl disconn failed: %d\n", error);
6687 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6688 		}
6689 		sc->hn_chim_gpadl = 0;
6690 	}
6691 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6692 }
6693 
6694 static void
6695 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6696 {
6697 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6698 	    ("invalid ring count %d", ring_cnt));
6699 
6700 	if (sc->hn_tx_ring_cnt > ring_cnt)
6701 		sc->hn_tx_ring_inuse = ring_cnt;
6702 	else
6703 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6704 	sc->hn_rx_ring_inuse = ring_cnt;
6705 
6706 #ifdef RSS
6707 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6708 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6709 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6710 		    rss_getnumbuckets());
6711 	}
6712 #endif
6713 
6714 	if (bootverbose) {
6715 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6716 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6717 	}
6718 }
6719 
6720 static void
6721 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6722 {
6723 
6724 	/*
6725 	 * NOTE:
6726 	 * The TX bufring will not be drained by the hypervisor,
6727 	 * if the primary channel is revoked.
6728 	 */
6729 	while (!vmbus_chan_rx_empty(chan) ||
6730 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6731 	     !vmbus_chan_tx_empty(chan)))
6732 		pause("waitch", 1);
6733 	vmbus_chan_intr_drain(chan);
6734 }
6735 
6736 static void
6737 hn_disable_rx(struct hn_softc *sc)
6738 {
6739 
6740 	/*
6741 	 * Disable RX by clearing RX filter forcefully.
6742 	 */
6743 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6744 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6745 
6746 	/*
6747 	 * Give RNDIS enough time to flush all pending data packets.
6748 	 */
6749 	pause("waitrx", (200 * hz) / 1000);
6750 }
6751 
6752 /*
6753  * NOTE:
6754  * RX/TX _must_ have been suspended/disabled, before this function
6755  * is called.
6756  */
6757 static void
6758 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6759 {
6760 	struct vmbus_channel **subch = NULL;
6761 	int nsubch;
6762 
6763 	/*
6764 	 * Drain RX/TX bufrings and interrupts.
6765 	 */
6766 	nsubch = nchan - 1;
6767 	if (nsubch > 0)
6768 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6769 
6770 	if (subch != NULL) {
6771 		int i;
6772 
6773 		for (i = 0; i < nsubch; ++i)
6774 			hn_chan_drain(sc, subch[i]);
6775 	}
6776 	hn_chan_drain(sc, sc->hn_prichan);
6777 
6778 	if (subch != NULL)
6779 		vmbus_subchan_rel(subch, nsubch);
6780 }
6781 
6782 static void
6783 hn_suspend_data(struct hn_softc *sc)
6784 {
6785 	struct hn_tx_ring *txr;
6786 	int i;
6787 
6788 	HN_LOCK_ASSERT(sc);
6789 
6790 	/*
6791 	 * Suspend TX.
6792 	 */
6793 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6794 		txr = &sc->hn_tx_ring[i];
6795 
6796 		mtx_lock(&txr->hn_tx_lock);
6797 		txr->hn_suspended = 1;
6798 		mtx_unlock(&txr->hn_tx_lock);
6799 		/* No one is able send more packets now. */
6800 
6801 		/*
6802 		 * Wait for all pending sends to finish.
6803 		 *
6804 		 * NOTE:
6805 		 * We will _not_ receive all pending send-done, if the
6806 		 * primary channel is revoked.
6807 		 */
6808 		while (hn_tx_ring_pending(txr) &&
6809 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6810 			pause("hnwtx", 1 /* 1 tick */);
6811 	}
6812 
6813 	/*
6814 	 * Disable RX.
6815 	 */
6816 	hn_disable_rx(sc);
6817 
6818 	/*
6819 	 * Drain RX/TX.
6820 	 */
6821 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6822 
6823 	/*
6824 	 * Drain any pending TX tasks.
6825 	 *
6826 	 * NOTE:
6827 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6828 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6829 	 */
6830 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6831 		txr = &sc->hn_tx_ring[i];
6832 
6833 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6834 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6835 	}
6836 }
6837 
6838 static void
6839 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6840 {
6841 
6842 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6843 }
6844 
6845 static void
6846 hn_suspend_mgmt(struct hn_softc *sc)
6847 {
6848 	struct task task;
6849 
6850 	HN_LOCK_ASSERT(sc);
6851 
6852 	/*
6853 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6854 	 * through hn_mgmt_taskq.
6855 	 */
6856 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6857 	vmbus_chan_run_task(sc->hn_prichan, &task);
6858 
6859 	/*
6860 	 * Make sure that all pending management tasks are completed.
6861 	 */
6862 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6863 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6864 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6865 }
6866 
6867 static void
6868 hn_suspend(struct hn_softc *sc)
6869 {
6870 
6871 	/* Disable polling. */
6872 	hn_polling(sc, 0);
6873 
6874 	/*
6875 	 * If the non-transparent mode VF is activated, the synthetic
6876 	 * device is receiving packets, so the data path of the
6877 	 * synthetic device must be suspended.
6878 	 */
6879 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6880 	    (sc->hn_flags & HN_FLAG_RXVF))
6881 		hn_suspend_data(sc);
6882 	hn_suspend_mgmt(sc);
6883 }
6884 
6885 static void
6886 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6887 {
6888 	int i;
6889 
6890 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6891 	    ("invalid TX ring count %d", tx_ring_cnt));
6892 
6893 	for (i = 0; i < tx_ring_cnt; ++i) {
6894 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6895 
6896 		mtx_lock(&txr->hn_tx_lock);
6897 		txr->hn_suspended = 0;
6898 		mtx_unlock(&txr->hn_tx_lock);
6899 	}
6900 }
6901 
6902 static void
6903 hn_resume_data(struct hn_softc *sc)
6904 {
6905 	int i;
6906 
6907 	HN_LOCK_ASSERT(sc);
6908 
6909 	/*
6910 	 * Re-enable RX.
6911 	 */
6912 	hn_rxfilter_config(sc);
6913 
6914 	/*
6915 	 * Make sure to clear suspend status on "all" TX rings,
6916 	 * since hn_tx_ring_inuse can be changed after
6917 	 * hn_suspend_data().
6918 	 */
6919 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6920 
6921 #ifdef HN_IFSTART_SUPPORT
6922 	if (!hn_use_if_start)
6923 #endif
6924 	{
6925 		/*
6926 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6927 		 * reduced.
6928 		 */
6929 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6930 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6931 	}
6932 
6933 	/*
6934 	 * Kick start TX.
6935 	 */
6936 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6937 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6938 
6939 		/*
6940 		 * Use txeof task, so that any pending oactive can be
6941 		 * cleared properly.
6942 		 */
6943 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6944 	}
6945 }
6946 
6947 static void
6948 hn_resume_mgmt(struct hn_softc *sc)
6949 {
6950 
6951 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6952 
6953 	/*
6954 	 * Kick off network change detection, if it was pending.
6955 	 * If no network change was pending, start link status
6956 	 * checks, which is more lightweight than network change
6957 	 * detection.
6958 	 */
6959 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6960 		hn_change_network(sc);
6961 	else
6962 		hn_update_link_status(sc);
6963 }
6964 
6965 static void
6966 hn_resume(struct hn_softc *sc)
6967 {
6968 
6969 	/*
6970 	 * If the non-transparent mode VF is activated, the synthetic
6971 	 * device have to receive packets, so the data path of the
6972 	 * synthetic device must be resumed.
6973 	 */
6974 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6975 	    (sc->hn_flags & HN_FLAG_RXVF))
6976 		hn_resume_data(sc);
6977 
6978 	/*
6979 	 * Don't resume link status change if VF is attached/activated.
6980 	 * - In the non-transparent VF mode, the synthetic device marks
6981 	 *   link down until the VF is deactivated; i.e. VF is down.
6982 	 * - In transparent VF mode, VF's media status is used until
6983 	 *   the VF is detached.
6984 	 */
6985 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6986 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6987 		hn_resume_mgmt(sc);
6988 
6989 	/*
6990 	 * Re-enable polling if this interface is running and
6991 	 * the polling is requested.
6992 	 */
6993 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6994 		hn_polling(sc, sc->hn_pollhz);
6995 }
6996 
6997 static void
6998 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6999 {
7000 	const struct rndis_status_msg *msg;
7001 	int ofs;
7002 
7003 	if (dlen < sizeof(*msg)) {
7004 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7005 		return;
7006 	}
7007 	msg = data;
7008 
7009 	switch (msg->rm_status) {
7010 	case RNDIS_STATUS_MEDIA_CONNECT:
7011 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7012 		hn_update_link_status(sc);
7013 		break;
7014 
7015 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7016 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7017 		/* Not really useful; ignore. */
7018 		break;
7019 
7020 	case RNDIS_STATUS_NETWORK_CHANGE:
7021 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7022 		if (dlen < ofs + msg->rm_stbuflen ||
7023 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7024 			if_printf(sc->hn_ifp, "network changed\n");
7025 		} else {
7026 			uint32_t change;
7027 
7028 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7029 			    sizeof(change));
7030 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7031 			    change);
7032 		}
7033 		hn_change_network(sc);
7034 		break;
7035 
7036 	default:
7037 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7038 		    msg->rm_status);
7039 		break;
7040 	}
7041 }
7042 
7043 static int
7044 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7045 {
7046 	const struct rndis_pktinfo *pi = info_data;
7047 	uint32_t mask = 0;
7048 
7049 	while (info_dlen != 0) {
7050 		const void *data;
7051 		uint32_t dlen;
7052 
7053 		if (__predict_false(info_dlen < sizeof(*pi)))
7054 			return (EINVAL);
7055 		if (__predict_false(info_dlen < pi->rm_size))
7056 			return (EINVAL);
7057 		info_dlen -= pi->rm_size;
7058 
7059 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7060 			return (EINVAL);
7061 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7062 			return (EINVAL);
7063 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7064 		data = pi->rm_data;
7065 
7066 		if (pi->rm_internal == 1) {
7067 			switch (pi->rm_type) {
7068 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7069 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7070 					return (EINVAL);
7071 				info->pktinfo_id =
7072 				    (const struct packet_info_id *)data;
7073 				mask |= HN_RXINFO_PKTINFO_ID;
7074 				break;
7075 
7076 			default:
7077 				goto next;
7078 			}
7079 		} else {
7080 			switch (pi->rm_type) {
7081 			case NDIS_PKTINFO_TYPE_VLAN:
7082 				if (__predict_false(dlen
7083 				    < NDIS_VLAN_INFO_SIZE))
7084 					return (EINVAL);
7085 				info->vlan_info = (const uint32_t *)data;
7086 				mask |= HN_RXINFO_VLAN;
7087 				break;
7088 
7089 			case NDIS_PKTINFO_TYPE_CSUM:
7090 				if (__predict_false(dlen
7091 				    < NDIS_RXCSUM_INFO_SIZE))
7092 					return (EINVAL);
7093 				info->csum_info = (const uint32_t *)data;
7094 				mask |= HN_RXINFO_CSUM;
7095 				break;
7096 
7097 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7098 				if (__predict_false(dlen
7099 				    < HN_NDIS_HASH_VALUE_SIZE))
7100 					return (EINVAL);
7101 				info->hash_value = (const uint32_t *)data;
7102 				mask |= HN_RXINFO_HASHVAL;
7103 				break;
7104 
7105 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7106 				if (__predict_false(dlen
7107 				    < HN_NDIS_HASH_INFO_SIZE))
7108 					return (EINVAL);
7109 				info->hash_info = (const uint32_t *)data;
7110 				mask |= HN_RXINFO_HASHINF;
7111 				break;
7112 
7113 			default:
7114 				goto next;
7115 			}
7116 		}
7117 
7118 		if (mask == HN_RXINFO_ALL) {
7119 			/* All found; done */
7120 			break;
7121 		}
7122 next:
7123 		pi = (const struct rndis_pktinfo *)
7124 		    ((const uint8_t *)pi + pi->rm_size);
7125 	}
7126 
7127 	/*
7128 	 * Final fixup.
7129 	 * - If there is no hash value, invalidate the hash info.
7130 	 */
7131 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7132 		info->hash_info = NULL;
7133 	return (0);
7134 }
7135 
7136 static __inline bool
7137 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7138 {
7139 
7140 	if (off < check_off) {
7141 		if (__predict_true(off + len <= check_off))
7142 			return (false);
7143 	} else if (off > check_off) {
7144 		if (__predict_true(check_off + check_len <= off))
7145 			return (false);
7146 	}
7147 	return (true);
7148 }
7149 
7150 static __inline void
7151 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7152 		uint32_t len, struct hn_rxinfo *info)
7153 {
7154 	uint32_t cnt = rxr->rsc.cnt;
7155 
7156 	if (cnt) {
7157 		rxr->rsc.pktlen += len;
7158 	} else {
7159 		rxr->rsc.vlan_info = info->vlan_info;
7160 		rxr->rsc.csum_info = info->csum_info;
7161 		rxr->rsc.hash_info = info->hash_info;
7162 		rxr->rsc.hash_value = info->hash_value;
7163 		rxr->rsc.pktlen = len;
7164 	}
7165 
7166 	rxr->rsc.frag_data[cnt] = data;
7167 	rxr->rsc.frag_len[cnt] = len;
7168 	rxr->rsc.cnt++;
7169 }
7170 
7171 static void
7172 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7173 {
7174 	const struct rndis_packet_msg *pkt;
7175 	struct hn_rxinfo info;
7176 	int data_off, pktinfo_off, data_len, pktinfo_len;
7177 	bool rsc_more= false;
7178 
7179 	/*
7180 	 * Check length.
7181 	 */
7182 	if (__predict_false(dlen < sizeof(*pkt))) {
7183 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7184 		return;
7185 	}
7186 	pkt = data;
7187 
7188 	if (__predict_false(dlen < pkt->rm_len)) {
7189 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7190 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7191 		return;
7192 	}
7193 	if (__predict_false(pkt->rm_len <
7194 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7195 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7196 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7197 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7198 		    pkt->rm_pktinfolen);
7199 		return;
7200 	}
7201 	if (__predict_false(pkt->rm_datalen == 0)) {
7202 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7203 		return;
7204 	}
7205 
7206 	/*
7207 	 * Check offests.
7208 	 */
7209 #define IS_OFFSET_INVALID(ofs)			\
7210 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7211 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7212 
7213 	/* XXX Hyper-V does not meet data offset alignment requirement */
7214 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7215 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7216 		    "data offset %u\n", pkt->rm_dataoffset);
7217 		return;
7218 	}
7219 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7220 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7221 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7222 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7223 		return;
7224 	}
7225 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7226 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7227 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7228 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7229 		return;
7230 	}
7231 
7232 #undef IS_OFFSET_INVALID
7233 
7234 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7235 	data_len = pkt->rm_datalen;
7236 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7237 	pktinfo_len = pkt->rm_pktinfolen;
7238 
7239 	/*
7240 	 * Check OOB coverage.
7241 	 */
7242 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7243 		int oob_off, oob_len;
7244 
7245 		if_printf(rxr->hn_ifp, "got oobdata\n");
7246 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7247 		oob_len = pkt->rm_oobdatalen;
7248 
7249 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7250 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7251 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7252 			    pkt->rm_len, oob_off, oob_len);
7253 			return;
7254 		}
7255 
7256 		/*
7257 		 * Check against data.
7258 		 */
7259 		if (hn_rndis_check_overlap(oob_off, oob_len,
7260 		    data_off, data_len)) {
7261 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7262 			    "oob overlaps data, oob abs %d len %d, "
7263 			    "data abs %d len %d\n",
7264 			    oob_off, oob_len, data_off, data_len);
7265 			return;
7266 		}
7267 
7268 		/*
7269 		 * Check against pktinfo.
7270 		 */
7271 		if (pktinfo_len != 0 &&
7272 		    hn_rndis_check_overlap(oob_off, oob_len,
7273 		    pktinfo_off, pktinfo_len)) {
7274 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7275 			    "oob overlaps pktinfo, oob abs %d len %d, "
7276 			    "pktinfo abs %d len %d\n",
7277 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7278 			return;
7279 		}
7280 	}
7281 
7282 	/*
7283 	 * Check per-packet-info coverage and find useful per-packet-info.
7284 	 */
7285 	info.vlan_info = NULL;
7286 	info.csum_info = NULL;
7287 	info.hash_info = NULL;
7288 	info.pktinfo_id = NULL;
7289 
7290 	if (__predict_true(pktinfo_len != 0)) {
7291 		bool overlap;
7292 		int error;
7293 
7294 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7295 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7296 			    "pktinfo overflow, msglen %u, "
7297 			    "pktinfo abs %d len %d\n",
7298 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7299 			return;
7300 		}
7301 
7302 		/*
7303 		 * Check packet info coverage.
7304 		 */
7305 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7306 		    data_off, data_len);
7307 		if (__predict_false(overlap)) {
7308 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7309 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7310 			    "data abs %d len %d\n",
7311 			    pktinfo_off, pktinfo_len, data_off, data_len);
7312 			return;
7313 		}
7314 
7315 		/*
7316 		 * Find useful per-packet-info.
7317 		 */
7318 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7319 		    pktinfo_len, &info);
7320 		if (__predict_false(error)) {
7321 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7322 			    "pktinfo\n");
7323 			return;
7324 		}
7325 	}
7326 
7327 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7328 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7329 		    "data overflow, msglen %u, data abs %d len %d\n",
7330 		    pkt->rm_len, data_off, data_len);
7331 		return;
7332 	}
7333 
7334 	/* Identify RSC fragments, drop invalid packets */
7335 	if ((info.pktinfo_id != NULL) &&
7336 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7337 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7338 			rxr->rsc.cnt = 0;
7339 			rxr->hn_rsc_pkts++;
7340 		} else if (rxr->rsc.cnt == 0)
7341 			goto drop;
7342 
7343 		rsc_more = true;
7344 
7345 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7346 			rsc_more = false;
7347 
7348 		if (rsc_more && rxr->rsc.is_last)
7349 			goto drop;
7350 	} else {
7351 		rxr->rsc.cnt = 0;
7352 	}
7353 
7354 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7355 		goto drop;
7356 
7357 	/* Store data in per rx ring structure */
7358 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7359 	    data_len, &info);
7360 
7361 	if (rsc_more)
7362 		return;
7363 
7364 	hn_rxpkt(rxr);
7365 	rxr->rsc.cnt = 0;
7366 	return;
7367 drop:
7368 	rxr->hn_rsc_drop++;
7369 	return;
7370 }
7371 
7372 static __inline void
7373 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7374 {
7375 	const struct rndis_msghdr *hdr;
7376 
7377 	if (__predict_false(dlen < sizeof(*hdr))) {
7378 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7379 		return;
7380 	}
7381 	hdr = data;
7382 
7383 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7384 		/* Hot data path. */
7385 		hn_rndis_rx_data(rxr, data, dlen);
7386 		/* Done! */
7387 		return;
7388 	}
7389 
7390 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7391 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7392 	else
7393 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7394 }
7395 
7396 static void
7397 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7398 {
7399 	const struct hn_nvs_hdr *hdr;
7400 
7401 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7402 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7403 		return;
7404 	}
7405 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7406 
7407 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7408 		/* Useless; ignore */
7409 		return;
7410 	}
7411 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7412 }
7413 
7414 static void
7415 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7416     const struct vmbus_chanpkt_hdr *pkt)
7417 {
7418 	struct hn_nvs_sendctx *sndc;
7419 
7420 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7421 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7422 	    VMBUS_CHANPKT_DATALEN(pkt));
7423 	/*
7424 	 * NOTE:
7425 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7426 	 * its callback.
7427 	 */
7428 }
7429 
7430 static void
7431 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7432     const struct vmbus_chanpkt_hdr *pkthdr)
7433 {
7434 	struct epoch_tracker et;
7435 	const struct vmbus_chanpkt_rxbuf *pkt;
7436 	const struct hn_nvs_hdr *nvs_hdr;
7437 	int count, i, hlen;
7438 
7439 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7440 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7441 		return;
7442 	}
7443 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7444 
7445 	/* Make sure that this is a RNDIS message. */
7446 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7447 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7448 		    nvs_hdr->nvs_type);
7449 		return;
7450 	}
7451 
7452 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7453 	if (__predict_false(hlen < sizeof(*pkt))) {
7454 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7455 		return;
7456 	}
7457 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7458 
7459 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7460 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7461 		    pkt->cp_rxbuf_id);
7462 		return;
7463 	}
7464 
7465 	count = pkt->cp_rxbuf_cnt;
7466 	if (__predict_false(hlen <
7467 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7468 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7469 		return;
7470 	}
7471 
7472 	NET_EPOCH_ENTER(et);
7473 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7474 	for (i = 0; i < count; ++i) {
7475 		int ofs, len;
7476 
7477 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7478 		len = pkt->cp_rxbuf[i].rb_len;
7479 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7480 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7481 			    "ofs %d, len %d\n", i, ofs, len);
7482 			continue;
7483 		}
7484 
7485 		rxr->rsc.is_last = (i == (count - 1));
7486 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7487 	}
7488 	NET_EPOCH_EXIT(et);
7489 
7490 	/*
7491 	 * Ack the consumed RXBUF associated w/ this channel packet,
7492 	 * so that this RXBUF can be recycled by the hypervisor.
7493 	 */
7494 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7495 }
7496 
7497 static void
7498 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7499     uint64_t tid)
7500 {
7501 	struct hn_nvs_rndis_ack ack;
7502 	int retries, error;
7503 
7504 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7505 	ack.nvs_status = HN_NVS_STATUS_OK;
7506 
7507 	retries = 0;
7508 again:
7509 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7510 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7511 	if (__predict_false(error == EAGAIN)) {
7512 		/*
7513 		 * NOTE:
7514 		 * This should _not_ happen in real world, since the
7515 		 * consumption of the TX bufring from the TX path is
7516 		 * controlled.
7517 		 */
7518 		if (rxr->hn_ack_failed == 0)
7519 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7520 		rxr->hn_ack_failed++;
7521 		retries++;
7522 		if (retries < 10) {
7523 			DELAY(100);
7524 			goto again;
7525 		}
7526 		/* RXBUF leaks! */
7527 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7528 	}
7529 }
7530 
7531 static void
7532 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7533 {
7534 	struct hn_rx_ring *rxr = xrxr;
7535 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7536 
7537 	for (;;) {
7538 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7539 		int error, pktlen;
7540 
7541 		pktlen = rxr->hn_pktbuf_len;
7542 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7543 		if (__predict_false(error == ENOBUFS)) {
7544 			void *nbuf;
7545 			int nlen;
7546 
7547 			/*
7548 			 * Expand channel packet buffer.
7549 			 *
7550 			 * XXX
7551 			 * Use M_WAITOK here, since allocation failure
7552 			 * is fatal.
7553 			 */
7554 			nlen = rxr->hn_pktbuf_len * 2;
7555 			while (nlen < pktlen)
7556 				nlen *= 2;
7557 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7558 
7559 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7560 			    rxr->hn_pktbuf_len, nlen);
7561 
7562 			free(rxr->hn_pktbuf, M_DEVBUF);
7563 			rxr->hn_pktbuf = nbuf;
7564 			rxr->hn_pktbuf_len = nlen;
7565 			/* Retry! */
7566 			continue;
7567 		} else if (__predict_false(error == EAGAIN)) {
7568 			/* No more channel packets; done! */
7569 			break;
7570 		}
7571 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7572 
7573 		switch (pkt->cph_type) {
7574 		case VMBUS_CHANPKT_TYPE_COMP:
7575 			hn_nvs_handle_comp(sc, chan, pkt);
7576 			break;
7577 
7578 		case VMBUS_CHANPKT_TYPE_RXBUF:
7579 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7580 			break;
7581 
7582 		case VMBUS_CHANPKT_TYPE_INBAND:
7583 			hn_nvs_handle_notify(sc, pkt);
7584 			break;
7585 
7586 		default:
7587 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7588 			    pkt->cph_type);
7589 			break;
7590 		}
7591 	}
7592 	hn_chan_rollup(rxr, rxr->hn_txr);
7593 }
7594 
7595 static void
7596 hn_sysinit(void *arg __unused)
7597 {
7598 	int i;
7599 
7600 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7601 
7602 #ifdef HN_IFSTART_SUPPORT
7603 	/*
7604 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7605 	 * mainly due to the IFF_DRV_OACTIVE flag.
7606 	 */
7607 	if (hn_xpnt_vf && hn_use_if_start) {
7608 		hn_use_if_start = 0;
7609 		printf("hn: tranparent VF mode, if_transmit will be used, "
7610 		    "instead of if_start\n");
7611 	}
7612 #endif
7613 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7614 		printf("hn: invalid transparent VF attach routing "
7615 		    "wait timeout %d, reset to %d\n",
7616 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7617 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7618 	}
7619 
7620 	/*
7621 	 * Initialize VF map.
7622 	 */
7623 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7624 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7625 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7626 	    M_WAITOK | M_ZERO);
7627 
7628 	/*
7629 	 * Fix the # of TX taskqueues.
7630 	 */
7631 	if (hn_tx_taskq_cnt <= 0)
7632 		hn_tx_taskq_cnt = 1;
7633 	else if (hn_tx_taskq_cnt > mp_ncpus)
7634 		hn_tx_taskq_cnt = mp_ncpus;
7635 
7636 	/*
7637 	 * Fix the TX taskqueue mode.
7638 	 */
7639 	switch (hn_tx_taskq_mode) {
7640 	case HN_TX_TASKQ_M_INDEP:
7641 	case HN_TX_TASKQ_M_GLOBAL:
7642 	case HN_TX_TASKQ_M_EVTTQ:
7643 		break;
7644 	default:
7645 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7646 		break;
7647 	}
7648 
7649 	if (vm_guest != VM_GUEST_HV)
7650 		return;
7651 
7652 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7653 		return;
7654 
7655 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7656 	    M_DEVBUF, M_WAITOK);
7657 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7658 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7659 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7660 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7661 		    "hn tx%d", i);
7662 	}
7663 }
7664 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7665 
7666 static void
7667 hn_sysuninit(void *arg __unused)
7668 {
7669 
7670 	if (hn_tx_taskque != NULL) {
7671 		int i;
7672 
7673 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7674 			taskqueue_free(hn_tx_taskque[i]);
7675 		free(hn_tx_taskque, M_DEVBUF);
7676 	}
7677 
7678 	if (hn_vfmap != NULL)
7679 		free(hn_vfmap, M_DEVBUF);
7680 	rm_destroy(&hn_vfmap_lock);
7681 
7682 	counter_u64_free(hn_udpcs_fixup);
7683 }
7684 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7685