xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 1323ec57)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191 
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197 
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 	SLIST_ENTRY(hn_txdesc)		link;
201 #endif
202 	STAILQ_ENTRY(hn_txdesc)		agg_link;
203 
204 	/* Aggregated txdescs, in sending order. */
205 	STAILQ_HEAD(, hn_txdesc)	agg_list;
206 
207 	/* The oldest packet, if transmission aggregation happens. */
208 	struct mbuf			*m;
209 	struct hn_tx_ring		*txr;
210 	int				refs;
211 	uint32_t			flags;	/* HN_TXD_FLAG_ */
212 	struct hn_nvs_sendctx		send_ctx;
213 	uint32_t			chim_index;
214 	int				chim_size;
215 
216 	bus_dmamap_t			data_dmap;
217 
218 	bus_addr_t			rndis_pkt_paddr;
219 	struct rndis_packet_msg		*rndis_pkt;
220 	bus_dmamap_t			rndis_pkt_dmap;
221 };
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
231 struct packet_info_id {
232 	uint8_t				ver;
233 	uint8_t				flag;
234 	uint16_t			pkt_id;
235 };
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
240 struct hn_rxinfo {
241 	const uint32_t			*vlan_info;
242 	const uint32_t			*csum_info;
243 	const uint32_t			*hash_info;
244 	const uint32_t			*hash_value;
245 	const struct packet_info_id	*pktinfo_id;
246 };
247 
248 struct hn_rxvf_setarg {
249 	struct hn_rx_ring	*rxr;
250 	struct ifnet		*vf_ifp;
251 };
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(struct ifnet *);
276 #endif
277 static int			hn_transmit(struct ifnet *, struct mbuf *);
278 static void			hn_xmit_qflush(struct ifnet *);
279 static int			hn_ifmedia_upd(struct ifnet *);
280 static void			hn_ifmedia_sts(struct ifnet *,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, struct ifnet *, int);
284 static void			hn_ifaddr_event(void *, struct ifnet *);
285 static void			hn_ifnet_attevent(void *, struct ifnet *);
286 static void			hn_ifnet_detevent(void *, struct ifnet *);
287 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const struct ifnet *);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    struct ifnet *, bool);
293 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 #if __FreeBSD_version >= 1100099
329 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 #if __FreeBSD_version < 1100095
335 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
336 #else
337 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
338 #endif
339 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
346 #ifndef RSS
347 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
349 #endif
350 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
362 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
363 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
364 
365 static void			hn_stop(struct hn_softc *, bool);
366 static void			hn_init_locked(struct hn_softc *);
367 static int			hn_chan_attach(struct hn_softc *,
368 				    struct vmbus_channel *);
369 static void			hn_chan_detach(struct hn_softc *,
370 				    struct vmbus_channel *);
371 static int			hn_attach_subchans(struct hn_softc *);
372 static void			hn_detach_allchans(struct hn_softc *);
373 static void			hn_chan_rollup(struct hn_rx_ring *,
374 				    struct hn_tx_ring *);
375 static void			hn_set_ring_inuse(struct hn_softc *, int);
376 static int			hn_synth_attach(struct hn_softc *, int);
377 static void			hn_synth_detach(struct hn_softc *);
378 static int			hn_synth_alloc_subchans(struct hn_softc *,
379 				    int *);
380 static bool			hn_synth_attachable(const struct hn_softc *);
381 static void			hn_suspend(struct hn_softc *);
382 static void			hn_suspend_data(struct hn_softc *);
383 static void			hn_suspend_mgmt(struct hn_softc *);
384 static void			hn_resume(struct hn_softc *);
385 static void			hn_resume_data(struct hn_softc *);
386 static void			hn_resume_mgmt(struct hn_softc *);
387 static void			hn_suspend_mgmt_taskfunc(void *, int);
388 static void			hn_chan_drain(struct hn_softc *,
389 				    struct vmbus_channel *);
390 static void			hn_disable_rx(struct hn_softc *);
391 static void			hn_drain_rxtx(struct hn_softc *, int);
392 static void			hn_polling(struct hn_softc *, u_int);
393 static void			hn_chan_polling(struct vmbus_channel *, u_int);
394 static void			hn_mtu_change_fixup(struct hn_softc *);
395 
396 static void			hn_update_link_status(struct hn_softc *);
397 static void			hn_change_network(struct hn_softc *);
398 static void			hn_link_taskfunc(void *, int);
399 static void			hn_netchg_init_taskfunc(void *, int);
400 static void			hn_netchg_status_taskfunc(void *, int);
401 static void			hn_link_status(struct hn_softc *);
402 
403 static int			hn_create_rx_data(struct hn_softc *, int);
404 static void			hn_destroy_rx_data(struct hn_softc *);
405 static int			hn_check_iplen(const struct mbuf *, int);
406 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
407 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
408 static int			hn_rxfilter_config(struct hn_softc *);
409 static int			hn_rss_reconfig(struct hn_softc *);
410 static void			hn_rss_ind_fixup(struct hn_softc *);
411 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
412 static int			hn_rxpkt(struct hn_rx_ring *);
413 static uint32_t			hn_rss_type_fromndis(uint32_t);
414 static uint32_t			hn_rss_type_tondis(uint32_t);
415 
416 static int			hn_tx_ring_create(struct hn_softc *, int);
417 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
418 static int			hn_create_tx_data(struct hn_softc *, int);
419 static void			hn_fixup_tx_data(struct hn_softc *);
420 static void			hn_fixup_rx_data(struct hn_softc *);
421 static void			hn_destroy_tx_data(struct hn_softc *);
422 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
423 static void			hn_txdesc_gc(struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
426 				    struct hn_txdesc *, struct mbuf **);
427 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static void			hn_set_chim_size(struct hn_softc *, int);
430 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
431 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
432 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
433 static void			hn_resume_tx(struct hn_softc *, int);
434 static void			hn_set_txagg(struct hn_softc *);
435 static void			*hn_try_txagg(struct ifnet *,
436 				    struct hn_tx_ring *, struct hn_txdesc *,
437 				    int);
438 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
439 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
440 				    struct hn_softc *, struct vmbus_channel *,
441 				    const void *, int);
442 static int			hn_txpkt_sglist(struct hn_tx_ring *,
443 				    struct hn_txdesc *);
444 static int			hn_txpkt_chim(struct hn_tx_ring *,
445 				    struct hn_txdesc *);
446 static int			hn_xmit(struct hn_tx_ring *, int);
447 static void			hn_xmit_taskfunc(void *, int);
448 static void			hn_xmit_txeof(struct hn_tx_ring *);
449 static void			hn_xmit_txeof_taskfunc(void *, int);
450 #ifdef HN_IFSTART_SUPPORT
451 static int			hn_start_locked(struct hn_tx_ring *, int);
452 static void			hn_start_taskfunc(void *, int);
453 static void			hn_start_txeof(struct hn_tx_ring *);
454 static void			hn_start_txeof_taskfunc(void *, int);
455 #endif
456 
457 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
458 
459 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
460     "Hyper-V network interface");
461 
462 /* Trust tcp segment verification on host side. */
463 static int			hn_trust_hosttcp = 1;
464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
465     &hn_trust_hosttcp, 0,
466     "Trust tcp segment verification on host side, "
467     "when csum info is missing (global setting)");
468 
469 /* Trust udp datagrams verification on host side. */
470 static int			hn_trust_hostudp = 1;
471 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
472     &hn_trust_hostudp, 0,
473     "Trust udp datagram verification on host side, "
474     "when csum info is missing (global setting)");
475 
476 /* Trust ip packets verification on host side. */
477 static int			hn_trust_hostip = 1;
478 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
479     &hn_trust_hostip, 0,
480     "Trust ip packet verification on host side, "
481     "when csum info is missing (global setting)");
482 
483 /*
484  * Offload UDP/IPv4 checksum.
485  */
486 static int			hn_enable_udp4cs = 1;
487 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
488     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
489 
490 /*
491  * Offload UDP/IPv6 checksum.
492  */
493 static int			hn_enable_udp6cs = 1;
494 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
495     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
496 
497 /* Stats. */
498 static counter_u64_t		hn_udpcs_fixup;
499 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
500     &hn_udpcs_fixup, "# of UDP checksum fixup");
501 
502 /*
503  * See hn_set_hlen().
504  *
505  * This value is for Azure.  For Hyper-V, set this above
506  * 65536 to disable UDP datagram checksum fixup.
507  */
508 static int			hn_udpcs_fixup_mtu = 1420;
509 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
510     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
511 
512 /* Limit TSO burst size */
513 static int			hn_tso_maxlen = IP_MAXPACKET;
514 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
515     &hn_tso_maxlen, 0, "TSO burst limit");
516 
517 /* Limit chimney send size */
518 static int			hn_tx_chimney_size = 0;
519 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
520     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
521 
522 /* Limit the size of packet for direct transmission */
523 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
525     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
526 
527 /* # of LRO entries per RX ring */
528 #if defined(INET) || defined(INET6)
529 #if __FreeBSD_version >= 1100095
530 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
531 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
532     &hn_lro_entry_count, 0, "LRO entry count");
533 #endif
534 #endif
535 
536 static int			hn_tx_taskq_cnt = 1;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
538     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
539 
540 #define HN_TX_TASKQ_M_INDEP	0
541 #define HN_TX_TASKQ_M_GLOBAL	1
542 #define HN_TX_TASKQ_M_EVTTQ	2
543 
544 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
545 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
546     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
547     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
548 
549 #ifndef HN_USE_TXDESC_BUFRING
550 static int			hn_use_txdesc_bufring = 0;
551 #else
552 static int			hn_use_txdesc_bufring = 1;
553 #endif
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
555     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
556 
557 #ifdef HN_IFSTART_SUPPORT
558 /* Use ifnet.if_start instead of ifnet.if_transmit */
559 static int			hn_use_if_start = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
561     &hn_use_if_start, 0, "Use if_start TX method");
562 #endif
563 
564 /* # of channels to use */
565 static int			hn_chan_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
567     &hn_chan_cnt, 0,
568     "# of channels to use; each channel has one RX ring and one TX ring");
569 
570 /* # of transmit rings to use */
571 static int			hn_tx_ring_cnt = 0;
572 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
573     &hn_tx_ring_cnt, 0, "# of TX rings to use");
574 
575 /* Software TX ring deptch */
576 static int			hn_tx_swq_depth = 0;
577 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
578     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
579 
580 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
581 #if __FreeBSD_version >= 1100095
582 static u_int			hn_lro_mbufq_depth = 0;
583 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
584     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
585 #endif
586 
587 /* Packet transmission aggregation size limit */
588 static int			hn_tx_agg_size = -1;
589 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
590     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
591 
592 /* Packet transmission aggregation count limit */
593 static int			hn_tx_agg_pkts = -1;
594 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
595     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
596 
597 /* VF list */
598 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
599     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
600     hn_vflist_sysctl, "A",
601     "VF list");
602 
603 /* VF mapping */
604 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
605     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
606     hn_vfmap_sysctl, "A",
607     "VF mapping");
608 
609 /* Transparent VF */
610 static int			hn_xpnt_vf = 1;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
612     &hn_xpnt_vf, 0, "Transparent VF mod");
613 
614 /* Accurate BPF support for Transparent VF */
615 static int			hn_xpnt_vf_accbpf = 0;
616 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
617     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
618 
619 /* Extra wait for transparent VF attach routing; unit seconds. */
620 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
621 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
622     &hn_xpnt_vf_attwait, 0,
623     "Extra wait for transparent VF attach routing; unit: seconds");
624 
625 static u_int			hn_cpu_index;	/* next CPU for channel */
626 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
627 
628 static struct rmlock		hn_vfmap_lock;
629 static int			hn_vfmap_size;
630 static struct ifnet		**hn_vfmap;
631 
632 #ifndef RSS
633 static const uint8_t
634 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
635 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
636 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
637 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
638 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
639 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
640 };
641 #endif	/* !RSS */
642 
643 static const struct hyperv_guid	hn_guid = {
644 	.hv_guid = {
645 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
646 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
647 };
648 
649 static device_method_t hn_methods[] = {
650 	/* Device interface */
651 	DEVMETHOD(device_probe,		hn_probe),
652 	DEVMETHOD(device_attach,	hn_attach),
653 	DEVMETHOD(device_detach,	hn_detach),
654 	DEVMETHOD(device_shutdown,	hn_shutdown),
655 	DEVMETHOD_END
656 };
657 
658 static driver_t hn_driver = {
659 	"hn",
660 	hn_methods,
661 	sizeof(struct hn_softc)
662 };
663 
664 static devclass_t hn_devclass;
665 
666 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
667 MODULE_VERSION(hn, 1);
668 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
669 
670 #if __FreeBSD_version >= 1100099
671 static void
672 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
673 {
674 	int i;
675 
676 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
677 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
678 }
679 #endif
680 
681 static int
682 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
683 {
684 
685 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
686 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
687 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
688 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
689 }
690 
691 static int
692 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
693 {
694 	struct hn_nvs_rndis rndis;
695 
696 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
697 	    txd->chim_size > 0, ("invalid rndis chim txd"));
698 
699 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
700 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
701 	rndis.nvs_chim_idx = txd->chim_index;
702 	rndis.nvs_chim_sz = txd->chim_size;
703 
704 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
705 	    &rndis, sizeof(rndis), &txd->send_ctx));
706 }
707 
708 static __inline uint32_t
709 hn_chim_alloc(struct hn_softc *sc)
710 {
711 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
712 	u_long *bmap = sc->hn_chim_bmap;
713 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
714 
715 	for (i = 0; i < bmap_cnt; ++i) {
716 		int idx;
717 
718 		idx = ffsl(~bmap[i]);
719 		if (idx == 0)
720 			continue;
721 
722 		--idx; /* ffsl is 1-based */
723 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
724 		    ("invalid i %d and idx %d", i, idx));
725 
726 		if (atomic_testandset_long(&bmap[i], idx))
727 			continue;
728 
729 		ret = i * LONG_BIT + idx;
730 		break;
731 	}
732 	return (ret);
733 }
734 
735 static __inline void
736 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
737 {
738 	u_long mask;
739 	uint32_t idx;
740 
741 	idx = chim_idx / LONG_BIT;
742 	KASSERT(idx < sc->hn_chim_bmap_cnt,
743 	    ("invalid chimney index 0x%x", chim_idx));
744 
745 	mask = 1UL << (chim_idx % LONG_BIT);
746 	KASSERT(sc->hn_chim_bmap[idx] & mask,
747 	    ("index bitmap 0x%lx, chimney index %u, "
748 	     "bitmap idx %d, bitmask 0x%lx",
749 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
750 
751 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
752 }
753 
754 #if defined(INET6) || defined(INET)
755 
756 #define PULLUP_HDR(m, len)				\
757 do {							\
758 	if (__predict_false((m)->m_len < (len))) {	\
759 		(m) = m_pullup((m), (len));		\
760 		if ((m) == NULL)			\
761 			return (NULL);			\
762 	}						\
763 } while (0)
764 
765 /*
766  * NOTE: If this function failed, the m_head would be freed.
767  */
768 static __inline struct mbuf *
769 hn_tso_fixup(struct mbuf *m_head)
770 {
771 	struct ether_vlan_header *evl;
772 	struct tcphdr *th;
773 	int ehlen;
774 
775 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
776 
777 	PULLUP_HDR(m_head, sizeof(*evl));
778 	evl = mtod(m_head, struct ether_vlan_header *);
779 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
780 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
781 	else
782 		ehlen = ETHER_HDR_LEN;
783 	m_head->m_pkthdr.l2hlen = ehlen;
784 
785 #ifdef INET
786 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
787 		struct ip *ip;
788 		int iphlen;
789 
790 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
791 		ip = mtodo(m_head, ehlen);
792 		iphlen = ip->ip_hl << 2;
793 		m_head->m_pkthdr.l3hlen = iphlen;
794 
795 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
796 		th = mtodo(m_head, ehlen + iphlen);
797 
798 		ip->ip_len = 0;
799 		ip->ip_sum = 0;
800 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
801 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
802 	}
803 #endif
804 #if defined(INET6) && defined(INET)
805 	else
806 #endif
807 #ifdef INET6
808 	{
809 		struct ip6_hdr *ip6;
810 
811 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
812 		ip6 = mtodo(m_head, ehlen);
813 		if (ip6->ip6_nxt != IPPROTO_TCP) {
814 			m_freem(m_head);
815 			return (NULL);
816 		}
817 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
818 
819 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
820 		th = mtodo(m_head, ehlen + sizeof(*ip6));
821 
822 		ip6->ip6_plen = 0;
823 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
824 	}
825 #endif
826 	return (m_head);
827 }
828 
829 /*
830  * NOTE: If this function failed, the m_head would be freed.
831  */
832 static __inline struct mbuf *
833 hn_set_hlen(struct mbuf *m_head)
834 {
835 	const struct ether_vlan_header *evl;
836 	int ehlen;
837 
838 	PULLUP_HDR(m_head, sizeof(*evl));
839 	evl = mtod(m_head, const struct ether_vlan_header *);
840 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
841 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
842 	else
843 		ehlen = ETHER_HDR_LEN;
844 	m_head->m_pkthdr.l2hlen = ehlen;
845 
846 #ifdef INET
847 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
848 		const struct ip *ip;
849 		int iphlen;
850 
851 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
852 		ip = mtodo(m_head, ehlen);
853 		iphlen = ip->ip_hl << 2;
854 		m_head->m_pkthdr.l3hlen = iphlen;
855 
856 		/*
857 		 * UDP checksum offload does not work in Azure, if the
858 		 * following conditions meet:
859 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
860 		 * - IP_DF is not set in the IP hdr.
861 		 *
862 		 * Fallback to software checksum for these UDP datagrams.
863 		 */
864 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
865 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
866 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
867 			uint16_t off = ehlen + iphlen;
868 
869 			counter_u64_add(hn_udpcs_fixup, 1);
870 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
871 			*(uint16_t *)(m_head->m_data + off +
872                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
873 			    m_head, m_head->m_pkthdr.len, off);
874 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
875 		}
876 	}
877 #endif
878 #if defined(INET6) && defined(INET)
879 	else
880 #endif
881 #ifdef INET6
882 	{
883 		const struct ip6_hdr *ip6;
884 
885 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
886 		ip6 = mtodo(m_head, ehlen);
887 		if (ip6->ip6_nxt != IPPROTO_TCP &&
888 		    ip6->ip6_nxt != IPPROTO_UDP) {
889 			m_freem(m_head);
890 			return (NULL);
891 		}
892 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
893 	}
894 #endif
895 	return (m_head);
896 }
897 
898 /*
899  * NOTE: If this function failed, the m_head would be freed.
900  */
901 static __inline struct mbuf *
902 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
903 {
904 	const struct tcphdr *th;
905 	int ehlen, iphlen;
906 
907 	*tcpsyn = 0;
908 	ehlen = m_head->m_pkthdr.l2hlen;
909 	iphlen = m_head->m_pkthdr.l3hlen;
910 
911 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
912 	th = mtodo(m_head, ehlen + iphlen);
913 	if (th->th_flags & TH_SYN)
914 		*tcpsyn = 1;
915 	return (m_head);
916 }
917 
918 #undef PULLUP_HDR
919 
920 #endif	/* INET6 || INET */
921 
922 static int
923 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
924 {
925 	int error = 0;
926 
927 	HN_LOCK_ASSERT(sc);
928 
929 	if (sc->hn_rx_filter != filter) {
930 		error = hn_rndis_set_rxfilter(sc, filter);
931 		if (!error)
932 			sc->hn_rx_filter = filter;
933 	}
934 	return (error);
935 }
936 
937 static int
938 hn_rxfilter_config(struct hn_softc *sc)
939 {
940 	struct ifnet *ifp = sc->hn_ifp;
941 	uint32_t filter;
942 
943 	HN_LOCK_ASSERT(sc);
944 
945 	/*
946 	 * If the non-transparent mode VF is activated, we don't know how
947 	 * its RX filter is configured, so stick the synthetic device in
948 	 * the promiscous mode.
949 	 */
950 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
951 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
952 	} else {
953 		filter = NDIS_PACKET_TYPE_DIRECTED;
954 		if (ifp->if_flags & IFF_BROADCAST)
955 			filter |= NDIS_PACKET_TYPE_BROADCAST;
956 		/* TODO: support multicast list */
957 		if ((ifp->if_flags & IFF_ALLMULTI) ||
958 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
959 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
960 	}
961 	return (hn_set_rxfilter(sc, filter));
962 }
963 
964 static void
965 hn_set_txagg(struct hn_softc *sc)
966 {
967 	uint32_t size, pkts;
968 	int i;
969 
970 	/*
971 	 * Setup aggregation size.
972 	 */
973 	if (sc->hn_agg_size < 0)
974 		size = UINT32_MAX;
975 	else
976 		size = sc->hn_agg_size;
977 
978 	if (sc->hn_rndis_agg_size < size)
979 		size = sc->hn_rndis_agg_size;
980 
981 	/* NOTE: We only aggregate packets using chimney sending buffers. */
982 	if (size > (uint32_t)sc->hn_chim_szmax)
983 		size = sc->hn_chim_szmax;
984 
985 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
986 		/* Disable */
987 		size = 0;
988 		pkts = 0;
989 		goto done;
990 	}
991 
992 	/* NOTE: Type of the per TX ring setting is 'int'. */
993 	if (size > INT_MAX)
994 		size = INT_MAX;
995 
996 	/*
997 	 * Setup aggregation packet count.
998 	 */
999 	if (sc->hn_agg_pkts < 0)
1000 		pkts = UINT32_MAX;
1001 	else
1002 		pkts = sc->hn_agg_pkts;
1003 
1004 	if (sc->hn_rndis_agg_pkts < pkts)
1005 		pkts = sc->hn_rndis_agg_pkts;
1006 
1007 	if (pkts <= 1) {
1008 		/* Disable */
1009 		size = 0;
1010 		pkts = 0;
1011 		goto done;
1012 	}
1013 
1014 	/* NOTE: Type of the per TX ring setting is 'short'. */
1015 	if (pkts > SHRT_MAX)
1016 		pkts = SHRT_MAX;
1017 
1018 done:
1019 	/* NOTE: Type of the per TX ring setting is 'short'. */
1020 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1021 		/* Disable */
1022 		size = 0;
1023 		pkts = 0;
1024 	}
1025 
1026 	if (bootverbose) {
1027 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1028 		    size, pkts, sc->hn_rndis_agg_align);
1029 	}
1030 
1031 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1032 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1033 
1034 		mtx_lock(&txr->hn_tx_lock);
1035 		txr->hn_agg_szmax = size;
1036 		txr->hn_agg_pktmax = pkts;
1037 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1038 		mtx_unlock(&txr->hn_tx_lock);
1039 	}
1040 }
1041 
1042 static int
1043 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1044 {
1045 
1046 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1047 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1048 		return txr->hn_txdesc_cnt;
1049 	return hn_tx_swq_depth;
1050 }
1051 
1052 static int
1053 hn_rss_reconfig(struct hn_softc *sc)
1054 {
1055 	int error;
1056 
1057 	HN_LOCK_ASSERT(sc);
1058 
1059 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1060 		return (ENXIO);
1061 
1062 	/*
1063 	 * Disable RSS first.
1064 	 *
1065 	 * NOTE:
1066 	 * Direct reconfiguration by setting the UNCHG flags does
1067 	 * _not_ work properly.
1068 	 */
1069 	if (bootverbose)
1070 		if_printf(sc->hn_ifp, "disable RSS\n");
1071 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1072 	if (error) {
1073 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1074 		return (error);
1075 	}
1076 
1077 	/*
1078 	 * Reenable the RSS w/ the updated RSS key or indirect
1079 	 * table.
1080 	 */
1081 	if (bootverbose)
1082 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1083 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1084 	if (error) {
1085 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1086 		return (error);
1087 	}
1088 	return (0);
1089 }
1090 
1091 static void
1092 hn_rss_ind_fixup(struct hn_softc *sc)
1093 {
1094 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1095 	int i, nchan;
1096 
1097 	nchan = sc->hn_rx_ring_inuse;
1098 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1099 
1100 	/*
1101 	 * Check indirect table to make sure that all channels in it
1102 	 * can be used.
1103 	 */
1104 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1105 		if (rss->rss_ind[i] >= nchan) {
1106 			if_printf(sc->hn_ifp,
1107 			    "RSS indirect table %d fixup: %u -> %d\n",
1108 			    i, rss->rss_ind[i], nchan - 1);
1109 			rss->rss_ind[i] = nchan - 1;
1110 		}
1111 	}
1112 }
1113 
1114 static int
1115 hn_ifmedia_upd(struct ifnet *ifp __unused)
1116 {
1117 
1118 	return EOPNOTSUPP;
1119 }
1120 
1121 static void
1122 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1123 {
1124 	struct hn_softc *sc = ifp->if_softc;
1125 
1126 	ifmr->ifm_status = IFM_AVALID;
1127 	ifmr->ifm_active = IFM_ETHER;
1128 
1129 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1130 		ifmr->ifm_active |= IFM_NONE;
1131 		return;
1132 	}
1133 	ifmr->ifm_status |= IFM_ACTIVE;
1134 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1135 }
1136 
1137 static void
1138 hn_rxvf_set_task(void *xarg, int pending __unused)
1139 {
1140 	struct hn_rxvf_setarg *arg = xarg;
1141 
1142 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1143 }
1144 
1145 static void
1146 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1147 {
1148 	struct hn_rx_ring *rxr;
1149 	struct hn_rxvf_setarg arg;
1150 	struct task task;
1151 	int i;
1152 
1153 	HN_LOCK_ASSERT(sc);
1154 
1155 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1156 
1157 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1158 		rxr = &sc->hn_rx_ring[i];
1159 
1160 		if (i < sc->hn_rx_ring_inuse) {
1161 			arg.rxr = rxr;
1162 			arg.vf_ifp = vf_ifp;
1163 			vmbus_chan_run_task(rxr->hn_chan, &task);
1164 		} else {
1165 			rxr->hn_rxvf_ifp = vf_ifp;
1166 		}
1167 	}
1168 }
1169 
1170 static bool
1171 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1172 {
1173 	const struct ifnet *hn_ifp;
1174 
1175 	hn_ifp = sc->hn_ifp;
1176 
1177 	if (ifp == hn_ifp)
1178 		return (false);
1179 
1180 	if (ifp->if_alloctype != IFT_ETHER)
1181 		return (false);
1182 
1183 	/* Ignore lagg/vlan interfaces */
1184 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1185 	    strcmp(ifp->if_dname, "vlan") == 0)
1186 		return (false);
1187 
1188 	/*
1189 	 * During detach events ifp->if_addr might be NULL.
1190 	 * Make sure the bcmp() below doesn't panic on that:
1191 	 */
1192 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1193 		return (false);
1194 
1195 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1196 		return (false);
1197 
1198 	return (true);
1199 }
1200 
1201 static void
1202 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1203 {
1204 	struct ifnet *hn_ifp;
1205 
1206 	HN_LOCK(sc);
1207 
1208 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1209 		goto out;
1210 
1211 	if (!hn_ismyvf(sc, ifp))
1212 		goto out;
1213 	hn_ifp = sc->hn_ifp;
1214 
1215 	if (rxvf) {
1216 		if (sc->hn_flags & HN_FLAG_RXVF)
1217 			goto out;
1218 
1219 		sc->hn_flags |= HN_FLAG_RXVF;
1220 		hn_rxfilter_config(sc);
1221 	} else {
1222 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1223 			goto out;
1224 
1225 		sc->hn_flags &= ~HN_FLAG_RXVF;
1226 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1227 			hn_rxfilter_config(sc);
1228 		else
1229 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1230 	}
1231 
1232 	hn_nvs_set_datapath(sc,
1233 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1234 
1235 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1236 
1237 	if (rxvf) {
1238 		hn_vf_rss_fixup(sc, true);
1239 		hn_suspend_mgmt(sc);
1240 		sc->hn_link_flags &=
1241 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1242 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1243 	} else {
1244 		hn_vf_rss_restore(sc);
1245 		hn_resume_mgmt(sc);
1246 	}
1247 
1248 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1249 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1250 
1251 	if (bootverbose) {
1252 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1253 		    rxvf ? "to" : "from", ifp->if_xname);
1254 	}
1255 out:
1256 	HN_UNLOCK(sc);
1257 }
1258 
1259 static void
1260 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1261 {
1262 
1263 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1264 		return;
1265 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1266 }
1267 
1268 static void
1269 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1270 {
1271 
1272 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1273 }
1274 
1275 static int
1276 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1277 {
1278 	struct ifnet *ifp, *vf_ifp;
1279 	uint64_t tmp;
1280 	int error;
1281 
1282 	HN_LOCK_ASSERT(sc);
1283 	ifp = sc->hn_ifp;
1284 	vf_ifp = sc->hn_vf_ifp;
1285 
1286 	/*
1287 	 * Fix up requested capabilities w/ supported capabilities,
1288 	 * since the supported capabilities could have been changed.
1289 	 */
1290 	ifr->ifr_reqcap &= ifp->if_capabilities;
1291 	/* Pass SIOCSIFCAP to VF. */
1292 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1293 
1294 	/*
1295 	 * NOTE:
1296 	 * The error will be propagated to the callers, however, it
1297 	 * is _not_ useful here.
1298 	 */
1299 
1300 	/*
1301 	 * Merge VF's enabled capabilities.
1302 	 */
1303 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1304 
1305 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1306 	if (ifp->if_capenable & IFCAP_TXCSUM)
1307 		ifp->if_hwassist |= tmp;
1308 	else
1309 		ifp->if_hwassist &= ~tmp;
1310 
1311 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1312 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1313 		ifp->if_hwassist |= tmp;
1314 	else
1315 		ifp->if_hwassist &= ~tmp;
1316 
1317 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1318 	if (ifp->if_capenable & IFCAP_TSO4)
1319 		ifp->if_hwassist |= tmp;
1320 	else
1321 		ifp->if_hwassist &= ~tmp;
1322 
1323 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1324 	if (ifp->if_capenable & IFCAP_TSO6)
1325 		ifp->if_hwassist |= tmp;
1326 	else
1327 		ifp->if_hwassist &= ~tmp;
1328 
1329 	return (error);
1330 }
1331 
1332 static int
1333 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1334 {
1335 	struct ifnet *vf_ifp;
1336 	struct ifreq ifr;
1337 
1338 	HN_LOCK_ASSERT(sc);
1339 	vf_ifp = sc->hn_vf_ifp;
1340 
1341 	memset(&ifr, 0, sizeof(ifr));
1342 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1343 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1344 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1345 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1346 }
1347 
1348 static void
1349 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1350 {
1351 	struct ifnet *ifp = sc->hn_ifp;
1352 	int allmulti = 0;
1353 
1354 	HN_LOCK_ASSERT(sc);
1355 
1356 	/* XXX vlan(4) style mcast addr maintenance */
1357 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1358 		allmulti = IFF_ALLMULTI;
1359 
1360 	/* Always set the VF's if_flags */
1361 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1362 }
1363 
1364 static void
1365 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1366 {
1367 	struct rm_priotracker pt;
1368 	struct ifnet *hn_ifp = NULL;
1369 	struct mbuf *mn;
1370 
1371 	/*
1372 	 * XXX racy, if hn(4) ever detached.
1373 	 */
1374 	rm_rlock(&hn_vfmap_lock, &pt);
1375 	if (vf_ifp->if_index < hn_vfmap_size)
1376 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1377 	rm_runlock(&hn_vfmap_lock, &pt);
1378 
1379 	if (hn_ifp != NULL) {
1380 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1381 			/*
1382 			 * Allow tapping on the VF.
1383 			 */
1384 			ETHER_BPF_MTAP(vf_ifp, mn);
1385 
1386 			/*
1387 			 * Update VF stats.
1388 			 */
1389 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1390 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1391 				    mn->m_pkthdr.len);
1392 			}
1393 			/*
1394 			 * XXX IFCOUNTER_IMCAST
1395 			 * This stat updating is kinda invasive, since it
1396 			 * requires two checks on the mbuf: the length check
1397 			 * and the ethernet header check.  As of this write,
1398 			 * all multicast packets go directly to hn(4), which
1399 			 * makes imcast stat updating in the VF a try in vian.
1400 			 */
1401 
1402 			/*
1403 			 * Fix up rcvif and increase hn(4)'s ipackets.
1404 			 */
1405 			mn->m_pkthdr.rcvif = hn_ifp;
1406 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1407 		}
1408 		/*
1409 		 * Go through hn(4)'s if_input.
1410 		 */
1411 		hn_ifp->if_input(hn_ifp, m);
1412 	} else {
1413 		/*
1414 		 * In the middle of the transition; free this
1415 		 * mbuf chain.
1416 		 */
1417 		while (m != NULL) {
1418 			mn = m->m_nextpkt;
1419 			m->m_nextpkt = NULL;
1420 			m_freem(m);
1421 			m = mn;
1422 		}
1423 	}
1424 }
1425 
1426 static void
1427 hn_mtu_change_fixup(struct hn_softc *sc)
1428 {
1429 	struct ifnet *ifp;
1430 
1431 	HN_LOCK_ASSERT(sc);
1432 	ifp = sc->hn_ifp;
1433 
1434 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1435 #if __FreeBSD_version >= 1100099
1436 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1437 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1438 #endif
1439 }
1440 
1441 static uint32_t
1442 hn_rss_type_fromndis(uint32_t rss_hash)
1443 {
1444 	uint32_t types = 0;
1445 
1446 	if (rss_hash & NDIS_HASH_IPV4)
1447 		types |= RSS_TYPE_IPV4;
1448 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1449 		types |= RSS_TYPE_TCP_IPV4;
1450 	if (rss_hash & NDIS_HASH_IPV6)
1451 		types |= RSS_TYPE_IPV6;
1452 	if (rss_hash & NDIS_HASH_IPV6_EX)
1453 		types |= RSS_TYPE_IPV6_EX;
1454 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1455 		types |= RSS_TYPE_TCP_IPV6;
1456 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1457 		types |= RSS_TYPE_TCP_IPV6_EX;
1458 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1459 		types |= RSS_TYPE_UDP_IPV4;
1460 	return (types);
1461 }
1462 
1463 static uint32_t
1464 hn_rss_type_tondis(uint32_t types)
1465 {
1466 	uint32_t rss_hash = 0;
1467 
1468 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1469 	    ("UDP6 and UDP6EX are not supported"));
1470 
1471 	if (types & RSS_TYPE_IPV4)
1472 		rss_hash |= NDIS_HASH_IPV4;
1473 	if (types & RSS_TYPE_TCP_IPV4)
1474 		rss_hash |= NDIS_HASH_TCP_IPV4;
1475 	if (types & RSS_TYPE_IPV6)
1476 		rss_hash |= NDIS_HASH_IPV6;
1477 	if (types & RSS_TYPE_IPV6_EX)
1478 		rss_hash |= NDIS_HASH_IPV6_EX;
1479 	if (types & RSS_TYPE_TCP_IPV6)
1480 		rss_hash |= NDIS_HASH_TCP_IPV6;
1481 	if (types & RSS_TYPE_TCP_IPV6_EX)
1482 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1483 	if (types & RSS_TYPE_UDP_IPV4)
1484 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1485 	return (rss_hash);
1486 }
1487 
1488 static void
1489 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1490 {
1491 	int i;
1492 
1493 	HN_LOCK_ASSERT(sc);
1494 
1495 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1496 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1497 }
1498 
1499 static void
1500 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1501 {
1502 	struct ifnet *ifp, *vf_ifp;
1503 	struct ifrsshash ifrh;
1504 	struct ifrsskey ifrk;
1505 	int error;
1506 	uint32_t my_types, diff_types, mbuf_types = 0;
1507 
1508 	HN_LOCK_ASSERT(sc);
1509 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1510 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1511 
1512 	if (sc->hn_rx_ring_inuse == 1) {
1513 		/* No RSS on synthetic parts; done. */
1514 		return;
1515 	}
1516 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1517 		/* Synthetic parts do not support Toeplitz; done. */
1518 		return;
1519 	}
1520 
1521 	ifp = sc->hn_ifp;
1522 	vf_ifp = sc->hn_vf_ifp;
1523 
1524 	/*
1525 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1526 	 * supported.
1527 	 */
1528 	memset(&ifrk, 0, sizeof(ifrk));
1529 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1530 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1531 	if (error) {
1532 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1533 		    vf_ifp->if_xname, error);
1534 		goto done;
1535 	}
1536 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1537 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1538 		    vf_ifp->if_xname, ifrk.ifrk_func);
1539 		goto done;
1540 	}
1541 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1542 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1543 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1544 		goto done;
1545 	}
1546 
1547 	/*
1548 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1549 	 */
1550 	memset(&ifrh, 0, sizeof(ifrh));
1551 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1552 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1553 	if (error) {
1554 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1555 		    vf_ifp->if_xname, error);
1556 		goto done;
1557 	}
1558 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1559 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1560 		    vf_ifp->if_xname, ifrh.ifrh_func);
1561 		goto done;
1562 	}
1563 
1564 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1565 	if ((ifrh.ifrh_types & my_types) == 0) {
1566 		/* This disables RSS; ignore it then */
1567 		if_printf(ifp, "%s intersection of RSS types failed.  "
1568 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1569 		    ifrh.ifrh_types, my_types);
1570 		goto done;
1571 	}
1572 
1573 	diff_types = my_types ^ ifrh.ifrh_types;
1574 	my_types &= ifrh.ifrh_types;
1575 	mbuf_types = my_types;
1576 
1577 	/*
1578 	 * Detect RSS hash value/type confliction.
1579 	 *
1580 	 * NOTE:
1581 	 * We don't disable the hash type, but stop delivery the hash
1582 	 * value/type through mbufs on RX path.
1583 	 *
1584 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1585 	 * hash is delivered with type of TCP_IPV4.  This means if
1586 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1587 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1588 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1589 	 * here.
1590 	 */
1591 	if ((my_types & RSS_TYPE_IPV4) &&
1592 	    (diff_types & ifrh.ifrh_types &
1593 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1594 		/* Conflict; disable IPV4 hash type/value delivery. */
1595 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1596 		mbuf_types &= ~RSS_TYPE_IPV4;
1597 	}
1598 	if ((my_types & RSS_TYPE_IPV6) &&
1599 	    (diff_types & ifrh.ifrh_types &
1600 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1601 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1602 	      RSS_TYPE_IPV6_EX))) {
1603 		/* Conflict; disable IPV6 hash type/value delivery. */
1604 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1605 		mbuf_types &= ~RSS_TYPE_IPV6;
1606 	}
1607 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1608 	    (diff_types & ifrh.ifrh_types &
1609 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1610 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1611 	      RSS_TYPE_IPV6))) {
1612 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1613 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1614 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1615 	}
1616 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1617 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1618 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1619 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1620 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1621 	}
1622 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1623 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1624 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1625 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1626 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1627 	}
1628 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1629 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1630 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1631 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1632 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1633 	}
1634 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1635 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1636 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1637 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1638 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1639 	}
1640 
1641 	/*
1642 	 * Indirect table does not matter.
1643 	 */
1644 
1645 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1646 	    hn_rss_type_tondis(my_types);
1647 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1648 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1649 
1650 	if (reconf) {
1651 		error = hn_rss_reconfig(sc);
1652 		if (error) {
1653 			/* XXX roll-back? */
1654 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1655 			/* XXX keep going. */
1656 		}
1657 	}
1658 done:
1659 	/* Hash deliverability for mbufs. */
1660 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1661 }
1662 
1663 static void
1664 hn_vf_rss_restore(struct hn_softc *sc)
1665 {
1666 
1667 	HN_LOCK_ASSERT(sc);
1668 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1669 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1670 
1671 	if (sc->hn_rx_ring_inuse == 1)
1672 		goto done;
1673 
1674 	/*
1675 	 * Restore hash types.  Key does _not_ matter.
1676 	 */
1677 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1678 		int error;
1679 
1680 		sc->hn_rss_hash = sc->hn_rss_hcap;
1681 		error = hn_rss_reconfig(sc);
1682 		if (error) {
1683 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1684 			    error);
1685 			/* XXX keep going. */
1686 		}
1687 	}
1688 done:
1689 	/* Hash deliverability for mbufs. */
1690 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1691 }
1692 
1693 static void
1694 hn_xpnt_vf_setready(struct hn_softc *sc)
1695 {
1696 	struct ifnet *ifp, *vf_ifp;
1697 	struct ifreq ifr;
1698 
1699 	HN_LOCK_ASSERT(sc);
1700 	ifp = sc->hn_ifp;
1701 	vf_ifp = sc->hn_vf_ifp;
1702 
1703 	/*
1704 	 * Mark the VF ready.
1705 	 */
1706 	sc->hn_vf_rdytick = 0;
1707 
1708 	/*
1709 	 * Save information for restoration.
1710 	 */
1711 	sc->hn_saved_caps = ifp->if_capabilities;
1712 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1713 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1714 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1715 
1716 	/*
1717 	 * Intersect supported/enabled capabilities.
1718 	 *
1719 	 * NOTE:
1720 	 * if_hwassist is not changed here.
1721 	 */
1722 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1723 	ifp->if_capenable &= ifp->if_capabilities;
1724 
1725 	/*
1726 	 * Fix TSO settings.
1727 	 */
1728 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1729 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1730 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1731 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1732 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1733 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1734 
1735 	/*
1736 	 * Change VF's enabled capabilities.
1737 	 */
1738 	memset(&ifr, 0, sizeof(ifr));
1739 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1740 	ifr.ifr_reqcap = ifp->if_capenable;
1741 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1742 
1743 	if (ifp->if_mtu != ETHERMTU) {
1744 		int error;
1745 
1746 		/*
1747 		 * Change VF's MTU.
1748 		 */
1749 		memset(&ifr, 0, sizeof(ifr));
1750 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1751 		ifr.ifr_mtu = ifp->if_mtu;
1752 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1753 		if (error) {
1754 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1755 			    vf_ifp->if_xname, ifp->if_mtu);
1756 			if (ifp->if_mtu > ETHERMTU) {
1757 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1758 
1759 				/*
1760 				 * XXX
1761 				 * No need to adjust the synthetic parts' MTU;
1762 				 * failure of the adjustment will cause us
1763 				 * infinite headache.
1764 				 */
1765 				ifp->if_mtu = ETHERMTU;
1766 				hn_mtu_change_fixup(sc);
1767 			}
1768 		}
1769 	}
1770 }
1771 
1772 static bool
1773 hn_xpnt_vf_isready(struct hn_softc *sc)
1774 {
1775 
1776 	HN_LOCK_ASSERT(sc);
1777 
1778 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1779 		return (false);
1780 
1781 	if (sc->hn_vf_rdytick == 0)
1782 		return (true);
1783 
1784 	if (sc->hn_vf_rdytick > ticks)
1785 		return (false);
1786 
1787 	/* Mark VF as ready. */
1788 	hn_xpnt_vf_setready(sc);
1789 	return (true);
1790 }
1791 
1792 static void
1793 hn_xpnt_vf_setenable(struct hn_softc *sc)
1794 {
1795 	int i;
1796 
1797 	HN_LOCK_ASSERT(sc);
1798 
1799 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1800 	rm_wlock(&sc->hn_vf_lock);
1801 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1802 	rm_wunlock(&sc->hn_vf_lock);
1803 
1804 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1805 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1806 }
1807 
1808 static void
1809 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1810 {
1811 	int i;
1812 
1813 	HN_LOCK_ASSERT(sc);
1814 
1815 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1816 	rm_wlock(&sc->hn_vf_lock);
1817 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1818 	if (clear_vf)
1819 		sc->hn_vf_ifp = NULL;
1820 	rm_wunlock(&sc->hn_vf_lock);
1821 
1822 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1823 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1824 }
1825 
1826 static void
1827 hn_xpnt_vf_init(struct hn_softc *sc)
1828 {
1829 	int error;
1830 
1831 	HN_LOCK_ASSERT(sc);
1832 
1833 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1834 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1835 
1836 	if (bootverbose) {
1837 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1838 		    sc->hn_vf_ifp->if_xname);
1839 	}
1840 
1841 	/*
1842 	 * Bring the VF up.
1843 	 */
1844 	hn_xpnt_vf_saveifflags(sc);
1845 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1846 	error = hn_xpnt_vf_iocsetflags(sc);
1847 	if (error) {
1848 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1849 		    sc->hn_vf_ifp->if_xname, error);
1850 		return;
1851 	}
1852 
1853 	/*
1854 	 * NOTE:
1855 	 * Datapath setting must happen _after_ bringing the VF up.
1856 	 */
1857 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1858 
1859 	/*
1860 	 * NOTE:
1861 	 * Fixup RSS related bits _after_ the VF is brought up, since
1862 	 * many VFs generate RSS key during it's initialization.
1863 	 */
1864 	hn_vf_rss_fixup(sc, true);
1865 
1866 	/* Mark transparent mode VF as enabled. */
1867 	hn_xpnt_vf_setenable(sc);
1868 }
1869 
1870 static void
1871 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1872 {
1873 	struct hn_softc *sc = xsc;
1874 
1875 	HN_LOCK(sc);
1876 
1877 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1878 		goto done;
1879 	if (sc->hn_vf_ifp == NULL)
1880 		goto done;
1881 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1882 		goto done;
1883 
1884 	if (sc->hn_vf_rdytick != 0) {
1885 		/* Mark VF as ready. */
1886 		hn_xpnt_vf_setready(sc);
1887 	}
1888 
1889 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1890 		/*
1891 		 * Delayed VF initialization.
1892 		 */
1893 		if (bootverbose) {
1894 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1895 			    sc->hn_vf_ifp->if_xname);
1896 		}
1897 		hn_xpnt_vf_init(sc);
1898 	}
1899 done:
1900 	HN_UNLOCK(sc);
1901 }
1902 
1903 static void
1904 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1905 {
1906 	struct hn_softc *sc = xsc;
1907 
1908 	HN_LOCK(sc);
1909 
1910 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1911 		goto done;
1912 
1913 	if (!hn_ismyvf(sc, ifp))
1914 		goto done;
1915 
1916 	if (sc->hn_vf_ifp != NULL) {
1917 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1918 		    sc->hn_vf_ifp->if_xname);
1919 		goto done;
1920 	}
1921 
1922 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1923 		/*
1924 		 * ifnet.if_start is _not_ supported by transparent
1925 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1926 		 */
1927 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1928 		    "in transparent VF mode.\n", ifp->if_xname);
1929 		goto done;
1930 	}
1931 
1932 	rm_wlock(&hn_vfmap_lock);
1933 
1934 	if (ifp->if_index >= hn_vfmap_size) {
1935 		struct ifnet **newmap;
1936 		int newsize;
1937 
1938 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1939 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1940 		    M_WAITOK | M_ZERO);
1941 
1942 		memcpy(newmap, hn_vfmap,
1943 		    sizeof(struct ifnet *) * hn_vfmap_size);
1944 		free(hn_vfmap, M_DEVBUF);
1945 		hn_vfmap = newmap;
1946 		hn_vfmap_size = newsize;
1947 	}
1948 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1949 	    ("%s: ifindex %d was mapped to %s",
1950 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1951 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1952 
1953 	rm_wunlock(&hn_vfmap_lock);
1954 
1955 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1956 	rm_wlock(&sc->hn_vf_lock);
1957 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1958 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1959 	sc->hn_vf_ifp = ifp;
1960 	rm_wunlock(&sc->hn_vf_lock);
1961 
1962 	if (hn_xpnt_vf) {
1963 		int wait_ticks;
1964 
1965 		/*
1966 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1967 		 * Save vf_ifp's current if_input for later restoration.
1968 		 */
1969 		sc->hn_vf_input = ifp->if_input;
1970 		ifp->if_input = hn_xpnt_vf_input;
1971 
1972 		/*
1973 		 * Stop link status management; use the VF's.
1974 		 */
1975 		hn_suspend_mgmt(sc);
1976 
1977 		/*
1978 		 * Give VF sometime to complete its attach routing.
1979 		 */
1980 		wait_ticks = hn_xpnt_vf_attwait * hz;
1981 		sc->hn_vf_rdytick = ticks + wait_ticks;
1982 
1983 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1984 		    wait_ticks);
1985 	}
1986 done:
1987 	HN_UNLOCK(sc);
1988 }
1989 
1990 static void
1991 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1992 {
1993 	struct hn_softc *sc = xsc;
1994 
1995 	HN_LOCK(sc);
1996 
1997 	if (sc->hn_vf_ifp == NULL)
1998 		goto done;
1999 
2000 	if (!hn_ismyvf(sc, ifp))
2001 		goto done;
2002 
2003 	if (hn_xpnt_vf) {
2004 		/*
2005 		 * Make sure that the delayed initialization is not running.
2006 		 *
2007 		 * NOTE:
2008 		 * - This lock _must_ be released, since the hn_vf_init task
2009 		 *   will try holding this lock.
2010 		 * - It is safe to release this lock here, since the
2011 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2012 		 *
2013 		 * XXX racy, if hn(4) ever detached.
2014 		 */
2015 		HN_UNLOCK(sc);
2016 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2017 		HN_LOCK(sc);
2018 
2019 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2020 		    sc->hn_ifp->if_xname));
2021 		ifp->if_input = sc->hn_vf_input;
2022 		sc->hn_vf_input = NULL;
2023 
2024 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2025 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2026 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2027 
2028 		if (sc->hn_vf_rdytick == 0) {
2029 			/*
2030 			 * The VF was ready; restore some settings.
2031 			 */
2032 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2033 			/*
2034 			 * NOTE:
2035 			 * There is _no_ need to fixup if_capenable and
2036 			 * if_hwassist, since the if_capabilities before
2037 			 * restoration was an intersection of the VF's
2038 			 * if_capabilites and the synthetic device's
2039 			 * if_capabilites.
2040 			 */
2041 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2042 			sc->hn_ifp->if_hw_tsomaxsegcount =
2043 			    sc->hn_saved_tsosegcnt;
2044 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2045 		}
2046 
2047 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2048 			/*
2049 			 * Restore RSS settings.
2050 			 */
2051 			hn_vf_rss_restore(sc);
2052 
2053 			/*
2054 			 * Resume link status management, which was suspended
2055 			 * by hn_ifnet_attevent().
2056 			 */
2057 			hn_resume_mgmt(sc);
2058 		}
2059 	}
2060 
2061 	/* Mark transparent mode VF as disabled. */
2062 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2063 
2064 	rm_wlock(&hn_vfmap_lock);
2065 
2066 	KASSERT(ifp->if_index < hn_vfmap_size,
2067 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2068 	if (hn_vfmap[ifp->if_index] != NULL) {
2069 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2070 		    ("%s: ifindex %d was mapped to %s",
2071 		     ifp->if_xname, ifp->if_index,
2072 		     hn_vfmap[ifp->if_index]->if_xname));
2073 		hn_vfmap[ifp->if_index] = NULL;
2074 	}
2075 
2076 	rm_wunlock(&hn_vfmap_lock);
2077 done:
2078 	HN_UNLOCK(sc);
2079 }
2080 
2081 static void
2082 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2083 {
2084 	struct hn_softc *sc = xsc;
2085 
2086 	if (sc->hn_vf_ifp == ifp)
2087 		if_link_state_change(sc->hn_ifp, link_state);
2088 }
2089 
2090 static int
2091 hn_probe(device_t dev)
2092 {
2093 
2094 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2095 		device_set_desc(dev, "Hyper-V Network Interface");
2096 		return BUS_PROBE_DEFAULT;
2097 	}
2098 	return ENXIO;
2099 }
2100 
2101 static int
2102 hn_attach(device_t dev)
2103 {
2104 	struct hn_softc *sc = device_get_softc(dev);
2105 	struct sysctl_oid_list *child;
2106 	struct sysctl_ctx_list *ctx;
2107 	uint8_t eaddr[ETHER_ADDR_LEN];
2108 	struct ifnet *ifp = NULL;
2109 	int error, ring_cnt, tx_ring_cnt;
2110 	uint32_t mtu;
2111 
2112 	sc->hn_dev = dev;
2113 	sc->hn_prichan = vmbus_get_channel(dev);
2114 	HN_LOCK_INIT(sc);
2115 	rm_init(&sc->hn_vf_lock, "hnvf");
2116 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2117 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2118 
2119 	/*
2120 	 * Initialize these tunables once.
2121 	 */
2122 	sc->hn_agg_size = hn_tx_agg_size;
2123 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2124 
2125 	/*
2126 	 * Setup taskqueue for transmission.
2127 	 */
2128 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2129 		int i;
2130 
2131 		sc->hn_tx_taskqs =
2132 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2133 		    M_DEVBUF, M_WAITOK);
2134 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2135 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2136 			    M_WAITOK, taskqueue_thread_enqueue,
2137 			    &sc->hn_tx_taskqs[i]);
2138 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2139 			    "%s tx%d", device_get_nameunit(dev), i);
2140 		}
2141 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2142 		sc->hn_tx_taskqs = hn_tx_taskque;
2143 	}
2144 
2145 	/*
2146 	 * Setup taskqueue for mangement tasks, e.g. link status.
2147 	 */
2148 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2149 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2150 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2151 	    device_get_nameunit(dev));
2152 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2153 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2154 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2155 	    hn_netchg_status_taskfunc, sc);
2156 
2157 	if (hn_xpnt_vf) {
2158 		/*
2159 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2160 		 */
2161 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2162 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2163 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2164 		    device_get_nameunit(dev));
2165 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2166 		    hn_xpnt_vf_init_taskfunc, sc);
2167 	}
2168 
2169 	/*
2170 	 * Allocate ifnet and setup its name earlier, so that if_printf
2171 	 * can be used by functions, which will be called after
2172 	 * ether_ifattach().
2173 	 */
2174 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2175 	ifp->if_softc = sc;
2176 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2177 
2178 	/*
2179 	 * Initialize ifmedia earlier so that it can be unconditionally
2180 	 * destroyed, if error happened later on.
2181 	 */
2182 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2183 
2184 	/*
2185 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2186 	 * to use (tx_ring_cnt).
2187 	 *
2188 	 * NOTE:
2189 	 * The # of RX rings to use is same as the # of channels to use.
2190 	 */
2191 	ring_cnt = hn_chan_cnt;
2192 	if (ring_cnt <= 0) {
2193 		/* Default */
2194 		ring_cnt = mp_ncpus;
2195 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2196 			ring_cnt = HN_RING_CNT_DEF_MAX;
2197 	} else if (ring_cnt > mp_ncpus) {
2198 		ring_cnt = mp_ncpus;
2199 	}
2200 #ifdef RSS
2201 	if (ring_cnt > rss_getnumbuckets())
2202 		ring_cnt = rss_getnumbuckets();
2203 #endif
2204 
2205 	tx_ring_cnt = hn_tx_ring_cnt;
2206 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2207 		tx_ring_cnt = ring_cnt;
2208 #ifdef HN_IFSTART_SUPPORT
2209 	if (hn_use_if_start) {
2210 		/* ifnet.if_start only needs one TX ring. */
2211 		tx_ring_cnt = 1;
2212 	}
2213 #endif
2214 
2215 	/*
2216 	 * Set the leader CPU for channels.
2217 	 */
2218 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2219 
2220 	/*
2221 	 * Create enough TX/RX rings, even if only limited number of
2222 	 * channels can be allocated.
2223 	 */
2224 	error = hn_create_tx_data(sc, tx_ring_cnt);
2225 	if (error)
2226 		goto failed;
2227 	error = hn_create_rx_data(sc, ring_cnt);
2228 	if (error)
2229 		goto failed;
2230 
2231 	/*
2232 	 * Create transaction context for NVS and RNDIS transactions.
2233 	 */
2234 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2235 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2236 	if (sc->hn_xact == NULL) {
2237 		error = ENXIO;
2238 		goto failed;
2239 	}
2240 
2241 	/*
2242 	 * Install orphan handler for the revocation of this device's
2243 	 * primary channel.
2244 	 *
2245 	 * NOTE:
2246 	 * The processing order is critical here:
2247 	 * Install the orphan handler, _before_ testing whether this
2248 	 * device's primary channel has been revoked or not.
2249 	 */
2250 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2251 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2252 		error = ENXIO;
2253 		goto failed;
2254 	}
2255 
2256 	/*
2257 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2258 	 */
2259 	error = hn_synth_attach(sc, ETHERMTU);
2260 	if (error)
2261 		goto failed;
2262 
2263 	error = hn_rndis_get_eaddr(sc, eaddr);
2264 	if (error)
2265 		goto failed;
2266 
2267 	error = hn_rndis_get_mtu(sc, &mtu);
2268 	if (error)
2269 		mtu = ETHERMTU;
2270 	else if (bootverbose)
2271 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2272 
2273 #if __FreeBSD_version >= 1100099
2274 	if (sc->hn_rx_ring_inuse > 1) {
2275 		/*
2276 		 * Reduce TCP segment aggregation limit for multiple
2277 		 * RX rings to increase ACK timeliness.
2278 		 */
2279 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2280 	}
2281 #endif
2282 
2283 	/*
2284 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2285 	 */
2286 	hn_fixup_tx_data(sc);
2287 	hn_fixup_rx_data(sc);
2288 
2289 	ctx = device_get_sysctl_ctx(dev);
2290 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2291 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2292 	    &sc->hn_nvs_ver, 0, "NVS version");
2293 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2294 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2295 	    hn_ndis_version_sysctl, "A", "NDIS version");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_caps_sysctl, "A", "capabilities");
2299 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2300 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2301 	    hn_hwassist_sysctl, "A", "hwassist");
2302 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2303 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2305 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2306 	    "max # of TSO segments");
2307 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2308 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2309 	    "max size of TSO segment");
2310 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2311 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2312 	    hn_rxfilter_sysctl, "A", "rxfilter");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2314 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_rss_hash_sysctl, "A", "RSS hash");
2316 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2317 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2318 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2319 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2320 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2321 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2322 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2323 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2324 #ifndef RSS
2325 	/*
2326 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2327 	 */
2328 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2329 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2330 	    hn_rss_key_sysctl, "IU", "RSS key");
2331 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2332 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2333 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2334 #endif
2335 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2336 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2337 	    "RNDIS offered packet transmission aggregation size limit");
2338 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2339 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2340 	    "RNDIS offered packet transmission aggregation count limit");
2341 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2342 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2343 	    "RNDIS packet transmission aggregation alignment");
2344 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2345 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2346 	    hn_txagg_size_sysctl, "I",
2347 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2348 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2349 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2350 	    hn_txagg_pkts_sysctl, "I",
2351 	    "Packet transmission aggregation packets, "
2352 	    "0 -- disable, -1 -- auto");
2353 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2354 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2355 	    hn_polling_sysctl, "I",
2356 	    "Polling frequency: [100,1000000], 0 disable polling");
2357 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2358 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2359 	    hn_vf_sysctl, "A", "Virtual Function's name");
2360 	if (!hn_xpnt_vf) {
2361 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2362 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2363 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2364 	} else {
2365 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2366 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2367 		    hn_xpnt_vf_enabled_sysctl, "I",
2368 		    "Transparent VF enabled");
2369 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2370 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2371 		    hn_xpnt_vf_accbpf_sysctl, "I",
2372 		    "Accurate BPF for transparent VF");
2373 	}
2374 
2375 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2376 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2377 	    "switch to rsc");
2378 
2379 	/*
2380 	 * Setup the ifmedia, which has been initialized earlier.
2381 	 */
2382 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2383 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2384 	/* XXX ifmedia_set really should do this for us */
2385 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2386 
2387 	/*
2388 	 * Setup the ifnet for this interface.
2389 	 */
2390 
2391 	ifp->if_baudrate = IF_Gbps(10);
2392 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2393 	ifp->if_ioctl = hn_ioctl;
2394 	ifp->if_init = hn_init;
2395 #ifdef HN_IFSTART_SUPPORT
2396 	if (hn_use_if_start) {
2397 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2398 
2399 		ifp->if_start = hn_start;
2400 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2401 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2402 		IFQ_SET_READY(&ifp->if_snd);
2403 	} else
2404 #endif
2405 	{
2406 		ifp->if_transmit = hn_transmit;
2407 		ifp->if_qflush = hn_xmit_qflush;
2408 	}
2409 
2410 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2411 #ifdef foo
2412 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2413 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2414 #endif
2415 	if (sc->hn_caps & HN_CAP_VLAN) {
2416 		/* XXX not sure about VLAN_MTU. */
2417 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2418 	}
2419 
2420 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2421 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2422 		ifp->if_capabilities |= IFCAP_TXCSUM;
2423 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2424 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2425 	if (sc->hn_caps & HN_CAP_TSO4) {
2426 		ifp->if_capabilities |= IFCAP_TSO4;
2427 		ifp->if_hwassist |= CSUM_IP_TSO;
2428 	}
2429 	if (sc->hn_caps & HN_CAP_TSO6) {
2430 		ifp->if_capabilities |= IFCAP_TSO6;
2431 		ifp->if_hwassist |= CSUM_IP6_TSO;
2432 	}
2433 
2434 	/* Enable all available capabilities by default. */
2435 	ifp->if_capenable = ifp->if_capabilities;
2436 
2437 	/*
2438 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2439 	 * be enabled through SIOCSIFCAP.
2440 	 */
2441 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2442 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2443 
2444 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2445 		/*
2446 		 * Lock hn_set_tso_maxsize() to simplify its
2447 		 * internal logic.
2448 		 */
2449 		HN_LOCK(sc);
2450 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2451 		HN_UNLOCK(sc);
2452 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2453 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2454 	}
2455 
2456 	ether_ifattach(ifp, eaddr);
2457 
2458 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2459 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2460 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2461 	}
2462 	if (mtu < ETHERMTU) {
2463 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2464 		ifp->if_mtu = mtu;
2465 	}
2466 
2467 	/* Inform the upper layer about the long frame support. */
2468 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2469 
2470 	/*
2471 	 * Kick off link status check.
2472 	 */
2473 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2474 	hn_update_link_status(sc);
2475 
2476 	if (!hn_xpnt_vf) {
2477 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2478 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2479 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2480 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2481 	} else {
2482 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2483 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2484 	}
2485 
2486 	/*
2487 	 * NOTE:
2488 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2489 	 * since interface's LLADDR is needed; interface LLADDR is not
2490 	 * available when ifnet_arrival event is triggered.
2491 	 */
2492 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2493 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2494 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2495 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2496 
2497 	return (0);
2498 failed:
2499 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2500 		hn_synth_detach(sc);
2501 	hn_detach(dev);
2502 	return (error);
2503 }
2504 
2505 static int
2506 hn_detach(device_t dev)
2507 {
2508 	struct hn_softc *sc = device_get_softc(dev);
2509 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2510 
2511 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2512 		/*
2513 		 * In case that the vmbus missed the orphan handler
2514 		 * installation.
2515 		 */
2516 		vmbus_xact_ctx_orphan(sc->hn_xact);
2517 	}
2518 
2519 	if (sc->hn_ifaddr_evthand != NULL)
2520 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2521 	if (sc->hn_ifnet_evthand != NULL)
2522 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2523 	if (sc->hn_ifnet_atthand != NULL) {
2524 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2525 		    sc->hn_ifnet_atthand);
2526 	}
2527 	if (sc->hn_ifnet_dethand != NULL) {
2528 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2529 		    sc->hn_ifnet_dethand);
2530 	}
2531 	if (sc->hn_ifnet_lnkhand != NULL)
2532 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2533 
2534 	vf_ifp = sc->hn_vf_ifp;
2535 	__compiler_membar();
2536 	if (vf_ifp != NULL)
2537 		hn_ifnet_detevent(sc, vf_ifp);
2538 
2539 	if (device_is_attached(dev)) {
2540 		HN_LOCK(sc);
2541 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2542 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2543 				hn_stop(sc, true);
2544 			/*
2545 			 * NOTE:
2546 			 * hn_stop() only suspends data, so managment
2547 			 * stuffs have to be suspended manually here.
2548 			 */
2549 			hn_suspend_mgmt(sc);
2550 			hn_synth_detach(sc);
2551 		}
2552 		HN_UNLOCK(sc);
2553 		ether_ifdetach(ifp);
2554 	}
2555 
2556 	ifmedia_removeall(&sc->hn_media);
2557 	hn_destroy_rx_data(sc);
2558 	hn_destroy_tx_data(sc);
2559 
2560 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2561 		int i;
2562 
2563 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2564 			taskqueue_free(sc->hn_tx_taskqs[i]);
2565 		free(sc->hn_tx_taskqs, M_DEVBUF);
2566 	}
2567 	taskqueue_free(sc->hn_mgmt_taskq0);
2568 	if (sc->hn_vf_taskq != NULL)
2569 		taskqueue_free(sc->hn_vf_taskq);
2570 
2571 	if (sc->hn_xact != NULL) {
2572 		/*
2573 		 * Uninstall the orphan handler _before_ the xact is
2574 		 * destructed.
2575 		 */
2576 		vmbus_chan_unset_orphan(sc->hn_prichan);
2577 		vmbus_xact_ctx_destroy(sc->hn_xact);
2578 	}
2579 
2580 	if_free(ifp);
2581 
2582 	HN_LOCK_DESTROY(sc);
2583 	rm_destroy(&sc->hn_vf_lock);
2584 	return (0);
2585 }
2586 
2587 static int
2588 hn_shutdown(device_t dev)
2589 {
2590 
2591 	return (0);
2592 }
2593 
2594 static void
2595 hn_link_status(struct hn_softc *sc)
2596 {
2597 	uint32_t link_status;
2598 	int error;
2599 
2600 	error = hn_rndis_get_linkstatus(sc, &link_status);
2601 	if (error) {
2602 		/* XXX what to do? */
2603 		return;
2604 	}
2605 
2606 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2607 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2608 	else
2609 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2610 	if_link_state_change(sc->hn_ifp,
2611 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2612 	    LINK_STATE_UP : LINK_STATE_DOWN);
2613 }
2614 
2615 static void
2616 hn_link_taskfunc(void *xsc, int pending __unused)
2617 {
2618 	struct hn_softc *sc = xsc;
2619 
2620 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2621 		return;
2622 	hn_link_status(sc);
2623 }
2624 
2625 static void
2626 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2627 {
2628 	struct hn_softc *sc = xsc;
2629 
2630 	/* Prevent any link status checks from running. */
2631 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2632 
2633 	/*
2634 	 * Fake up a [link down --> link up] state change; 5 seconds
2635 	 * delay is used, which closely simulates miibus reaction
2636 	 * upon link down event.
2637 	 */
2638 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2639 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2640 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2641 	    &sc->hn_netchg_status, 5 * hz);
2642 }
2643 
2644 static void
2645 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2646 {
2647 	struct hn_softc *sc = xsc;
2648 
2649 	/* Re-allow link status checks. */
2650 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2651 	hn_link_status(sc);
2652 }
2653 
2654 static void
2655 hn_update_link_status(struct hn_softc *sc)
2656 {
2657 
2658 	if (sc->hn_mgmt_taskq != NULL)
2659 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2660 }
2661 
2662 static void
2663 hn_change_network(struct hn_softc *sc)
2664 {
2665 
2666 	if (sc->hn_mgmt_taskq != NULL)
2667 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2668 }
2669 
2670 static __inline int
2671 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2672     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2673 {
2674 	struct mbuf *m = *m_head;
2675 	int error;
2676 
2677 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2678 
2679 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2680 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2681 	if (error == EFBIG) {
2682 		struct mbuf *m_new;
2683 
2684 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2685 		if (m_new == NULL)
2686 			return ENOBUFS;
2687 		else
2688 			*m_head = m = m_new;
2689 		txr->hn_tx_collapsed++;
2690 
2691 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2692 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2693 	}
2694 	if (!error) {
2695 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2696 		    BUS_DMASYNC_PREWRITE);
2697 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2698 	}
2699 	return error;
2700 }
2701 
2702 static __inline int
2703 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2704 {
2705 
2706 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2707 	    ("put an onlist txd %#x", txd->flags));
2708 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2709 	    ("put an onagg txd %#x", txd->flags));
2710 
2711 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2712 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2713 		return 0;
2714 
2715 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2716 		struct hn_txdesc *tmp_txd;
2717 
2718 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2719 			int freed __diagused;
2720 
2721 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2722 			    ("resursive aggregation on aggregated txdesc"));
2723 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2724 			    ("not aggregated txdesc"));
2725 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2726 			    ("aggregated txdesc uses dmamap"));
2727 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2728 			    ("aggregated txdesc consumes "
2729 			     "chimney sending buffer"));
2730 			KASSERT(tmp_txd->chim_size == 0,
2731 			    ("aggregated txdesc has non-zero "
2732 			     "chimney sending size"));
2733 
2734 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2735 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2736 			freed = hn_txdesc_put(txr, tmp_txd);
2737 			KASSERT(freed, ("failed to free aggregated txdesc"));
2738 		}
2739 	}
2740 
2741 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2742 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2743 		    ("chim txd uses dmamap"));
2744 		hn_chim_free(txr->hn_sc, txd->chim_index);
2745 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2746 		txd->chim_size = 0;
2747 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2748 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2749 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2750 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2751 		    txd->data_dmap);
2752 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2753 	}
2754 
2755 	if (txd->m != NULL) {
2756 		m_freem(txd->m);
2757 		txd->m = NULL;
2758 	}
2759 
2760 	txd->flags |= HN_TXD_FLAG_ONLIST;
2761 #ifndef HN_USE_TXDESC_BUFRING
2762 	mtx_lock_spin(&txr->hn_txlist_spin);
2763 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2764 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2765 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2766 	txr->hn_txdesc_avail++;
2767 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2768 	mtx_unlock_spin(&txr->hn_txlist_spin);
2769 #else	/* HN_USE_TXDESC_BUFRING */
2770 #ifdef HN_DEBUG
2771 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2772 #endif
2773 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2774 #endif	/* !HN_USE_TXDESC_BUFRING */
2775 
2776 	return 1;
2777 }
2778 
2779 static __inline struct hn_txdesc *
2780 hn_txdesc_get(struct hn_tx_ring *txr)
2781 {
2782 	struct hn_txdesc *txd;
2783 
2784 #ifndef HN_USE_TXDESC_BUFRING
2785 	mtx_lock_spin(&txr->hn_txlist_spin);
2786 	txd = SLIST_FIRST(&txr->hn_txlist);
2787 	if (txd != NULL) {
2788 		KASSERT(txr->hn_txdesc_avail > 0,
2789 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2790 		txr->hn_txdesc_avail--;
2791 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2792 	}
2793 	mtx_unlock_spin(&txr->hn_txlist_spin);
2794 #else
2795 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2796 #endif
2797 
2798 	if (txd != NULL) {
2799 #ifdef HN_USE_TXDESC_BUFRING
2800 #ifdef HN_DEBUG
2801 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2802 #endif
2803 #endif	/* HN_USE_TXDESC_BUFRING */
2804 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2805 		    STAILQ_EMPTY(&txd->agg_list) &&
2806 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2807 		    txd->chim_size == 0 &&
2808 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2809 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2810 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2811 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2812 		txd->refs = 1;
2813 	}
2814 	return txd;
2815 }
2816 
2817 static __inline void
2818 hn_txdesc_hold(struct hn_txdesc *txd)
2819 {
2820 
2821 	/* 0->1 transition will never work */
2822 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2823 	atomic_add_int(&txd->refs, 1);
2824 }
2825 
2826 static __inline void
2827 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2828 {
2829 
2830 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2831 	    ("recursive aggregation on aggregating txdesc"));
2832 
2833 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2834 	    ("already aggregated"));
2835 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2836 	    ("recursive aggregation on to-be-aggregated txdesc"));
2837 
2838 	txd->flags |= HN_TXD_FLAG_ONAGG;
2839 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2840 }
2841 
2842 static bool
2843 hn_tx_ring_pending(struct hn_tx_ring *txr)
2844 {
2845 	bool pending = false;
2846 
2847 #ifndef HN_USE_TXDESC_BUFRING
2848 	mtx_lock_spin(&txr->hn_txlist_spin);
2849 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2850 		pending = true;
2851 	mtx_unlock_spin(&txr->hn_txlist_spin);
2852 #else
2853 	if (!buf_ring_full(txr->hn_txdesc_br))
2854 		pending = true;
2855 #endif
2856 	return (pending);
2857 }
2858 
2859 static __inline void
2860 hn_txeof(struct hn_tx_ring *txr)
2861 {
2862 	txr->hn_has_txeof = 0;
2863 	txr->hn_txeof(txr);
2864 }
2865 
2866 static void
2867 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2868     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2869 {
2870 	struct hn_txdesc *txd = sndc->hn_cbarg;
2871 	struct hn_tx_ring *txr;
2872 
2873 	txr = txd->txr;
2874 	KASSERT(txr->hn_chan == chan,
2875 	    ("channel mismatch, on chan%u, should be chan%u",
2876 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2877 
2878 	txr->hn_has_txeof = 1;
2879 	hn_txdesc_put(txr, txd);
2880 
2881 	++txr->hn_txdone_cnt;
2882 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2883 		txr->hn_txdone_cnt = 0;
2884 		if (txr->hn_oactive)
2885 			hn_txeof(txr);
2886 	}
2887 }
2888 
2889 static void
2890 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2891 {
2892 #if defined(INET) || defined(INET6)
2893 	struct epoch_tracker et;
2894 
2895 	NET_EPOCH_ENTER(et);
2896 	tcp_lro_flush_all(&rxr->hn_lro);
2897 	NET_EPOCH_EXIT(et);
2898 #endif
2899 
2900 	/*
2901 	 * NOTE:
2902 	 * 'txr' could be NULL, if multiple channels and
2903 	 * ifnet.if_start method are enabled.
2904 	 */
2905 	if (txr == NULL || !txr->hn_has_txeof)
2906 		return;
2907 
2908 	txr->hn_txdone_cnt = 0;
2909 	hn_txeof(txr);
2910 }
2911 
2912 static __inline uint32_t
2913 hn_rndis_pktmsg_offset(uint32_t ofs)
2914 {
2915 
2916 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2917 	    ("invalid RNDIS packet msg offset %u", ofs));
2918 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2919 }
2920 
2921 static __inline void *
2922 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2923     size_t pi_dlen, uint32_t pi_type)
2924 {
2925 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2926 	struct rndis_pktinfo *pi;
2927 
2928 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2929 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2930 
2931 	/*
2932 	 * Per-packet-info does not move; it only grows.
2933 	 *
2934 	 * NOTE:
2935 	 * rm_pktinfooffset in this phase counts from the beginning
2936 	 * of rndis_packet_msg.
2937 	 */
2938 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2939 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2940 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2941 	    pkt->rm_pktinfolen);
2942 	pkt->rm_pktinfolen += pi_size;
2943 
2944 	pi->rm_size = pi_size;
2945 	pi->rm_type = pi_type;
2946 	pi->rm_internal = 0;
2947 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2948 
2949 	return (pi->rm_data);
2950 }
2951 
2952 static __inline int
2953 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2954 {
2955 	struct hn_txdesc *txd;
2956 	struct mbuf *m;
2957 	int error, pkts;
2958 
2959 	txd = txr->hn_agg_txd;
2960 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2961 
2962 	/*
2963 	 * Since hn_txpkt() will reset this temporary stat, save
2964 	 * it now, so that oerrors can be updated properly, if
2965 	 * hn_txpkt() ever fails.
2966 	 */
2967 	pkts = txr->hn_stat_pkts;
2968 
2969 	/*
2970 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2971 	 * failure, save it for later freeing, if hn_txpkt() ever
2972 	 * fails.
2973 	 */
2974 	m = txd->m;
2975 	error = hn_txpkt(ifp, txr, txd);
2976 	if (__predict_false(error)) {
2977 		/* txd is freed, but m is not. */
2978 		m_freem(m);
2979 
2980 		txr->hn_flush_failed++;
2981 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2982 	}
2983 
2984 	/* Reset all aggregation states. */
2985 	txr->hn_agg_txd = NULL;
2986 	txr->hn_agg_szleft = 0;
2987 	txr->hn_agg_pktleft = 0;
2988 	txr->hn_agg_prevpkt = NULL;
2989 
2990 	return (error);
2991 }
2992 
2993 static void *
2994 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2995     int pktsize)
2996 {
2997 	void *chim;
2998 
2999 	if (txr->hn_agg_txd != NULL) {
3000 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
3001 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3002 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3003 			int olen;
3004 
3005 			/*
3006 			 * Update the previous RNDIS packet's total length,
3007 			 * it can be increased due to the mandatory alignment
3008 			 * padding for this RNDIS packet.  And update the
3009 			 * aggregating txdesc's chimney sending buffer size
3010 			 * accordingly.
3011 			 *
3012 			 * XXX
3013 			 * Zero-out the padding, as required by the RNDIS spec.
3014 			 */
3015 			olen = pkt->rm_len;
3016 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3017 			agg_txd->chim_size += pkt->rm_len - olen;
3018 
3019 			/* Link this txdesc to the parent. */
3020 			hn_txdesc_agg(agg_txd, txd);
3021 
3022 			chim = (uint8_t *)pkt + pkt->rm_len;
3023 			/* Save the current packet for later fixup. */
3024 			txr->hn_agg_prevpkt = chim;
3025 
3026 			txr->hn_agg_pktleft--;
3027 			txr->hn_agg_szleft -= pktsize;
3028 			if (txr->hn_agg_szleft <=
3029 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3030 				/*
3031 				 * Probably can't aggregate more packets,
3032 				 * flush this aggregating txdesc proactively.
3033 				 */
3034 				txr->hn_agg_pktleft = 0;
3035 			}
3036 			/* Done! */
3037 			return (chim);
3038 		}
3039 		hn_flush_txagg(ifp, txr);
3040 	}
3041 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3042 
3043 	txr->hn_tx_chimney_tried++;
3044 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3045 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3046 		return (NULL);
3047 	txr->hn_tx_chimney++;
3048 
3049 	chim = txr->hn_sc->hn_chim +
3050 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3051 
3052 	if (txr->hn_agg_pktmax > 1 &&
3053 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3054 		txr->hn_agg_txd = txd;
3055 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3056 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3057 		txr->hn_agg_prevpkt = chim;
3058 	}
3059 	return (chim);
3060 }
3061 
3062 /*
3063  * NOTE:
3064  * If this function fails, then both txd and m_head0 will be freed.
3065  */
3066 static int
3067 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3068     struct mbuf **m_head0)
3069 {
3070 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3071 	int error, nsegs, i;
3072 	struct mbuf *m_head = *m_head0;
3073 	struct rndis_packet_msg *pkt;
3074 	uint32_t *pi_data;
3075 	void *chim = NULL;
3076 	int pkt_hlen, pkt_size;
3077 
3078 	pkt = txd->rndis_pkt;
3079 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3080 	if (pkt_size < txr->hn_chim_size) {
3081 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3082 		if (chim != NULL)
3083 			pkt = chim;
3084 	} else {
3085 		if (txr->hn_agg_txd != NULL)
3086 			hn_flush_txagg(ifp, txr);
3087 	}
3088 
3089 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3090 	pkt->rm_len = m_head->m_pkthdr.len;
3091 	pkt->rm_dataoffset = 0;
3092 	pkt->rm_datalen = m_head->m_pkthdr.len;
3093 	pkt->rm_oobdataoffset = 0;
3094 	pkt->rm_oobdatalen = 0;
3095 	pkt->rm_oobdataelements = 0;
3096 	pkt->rm_pktinfooffset = sizeof(*pkt);
3097 	pkt->rm_pktinfolen = 0;
3098 	pkt->rm_vchandle = 0;
3099 	pkt->rm_reserved = 0;
3100 
3101 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3102 		/*
3103 		 * Set the hash value for this packet.
3104 		 */
3105 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3106 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3107 
3108 		if (M_HASHTYPE_ISHASH(m_head))
3109 			/*
3110 			 * The flowid field contains the hash value host
3111 			 * set in the rx queue if it is a ip forwarding pkt.
3112 			 * Set the same hash value so host can send on the
3113 			 * cpu it was received.
3114 			 */
3115 			*pi_data = m_head->m_pkthdr.flowid;
3116 		else
3117 			/*
3118 			 * Otherwise just put the tx queue index.
3119 			 */
3120 			*pi_data = txr->hn_tx_idx;
3121 	}
3122 
3123 	if (m_head->m_flags & M_VLANTAG) {
3124 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3125 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3126 		*pi_data = NDIS_VLAN_INFO_MAKE(
3127 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3128 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3129 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3130 	}
3131 
3132 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3133 #if defined(INET6) || defined(INET)
3134 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3135 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3136 #ifdef INET
3137 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3138 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3139 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3140 			    m_head->m_pkthdr.tso_segsz);
3141 		}
3142 #endif
3143 #if defined(INET6) && defined(INET)
3144 		else
3145 #endif
3146 #ifdef INET6
3147 		{
3148 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3149 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3150 			    m_head->m_pkthdr.tso_segsz);
3151 		}
3152 #endif
3153 #endif	/* INET6 || INET */
3154 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3155 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3156 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3157 		if (m_head->m_pkthdr.csum_flags &
3158 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3159 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3160 		} else {
3161 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3162 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3163 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3164 		}
3165 
3166 		if (m_head->m_pkthdr.csum_flags &
3167 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3168 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3169 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3170 		} else if (m_head->m_pkthdr.csum_flags &
3171 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3172 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3173 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3174 		}
3175 	}
3176 
3177 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3178 	/* Fixup RNDIS packet message total length */
3179 	pkt->rm_len += pkt_hlen;
3180 	/* Convert RNDIS packet message offsets */
3181 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3182 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3183 
3184 	/*
3185 	 * Fast path: Chimney sending.
3186 	 */
3187 	if (chim != NULL) {
3188 		struct hn_txdesc *tgt_txd = txd;
3189 
3190 		if (txr->hn_agg_txd != NULL) {
3191 			tgt_txd = txr->hn_agg_txd;
3192 #ifdef INVARIANTS
3193 			*m_head0 = NULL;
3194 #endif
3195 		}
3196 
3197 		KASSERT(pkt == chim,
3198 		    ("RNDIS pkt not in chimney sending buffer"));
3199 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3200 		    ("chimney sending buffer is not used"));
3201 		tgt_txd->chim_size += pkt->rm_len;
3202 
3203 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3204 		    ((uint8_t *)chim) + pkt_hlen);
3205 
3206 		txr->hn_gpa_cnt = 0;
3207 		txr->hn_sendpkt = hn_txpkt_chim;
3208 		goto done;
3209 	}
3210 
3211 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3212 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3213 	    ("chimney buffer is used"));
3214 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3215 
3216 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3217 	if (__predict_false(error)) {
3218 		int freed __diagused;
3219 
3220 		/*
3221 		 * This mbuf is not linked w/ the txd yet, so free it now.
3222 		 */
3223 		m_freem(m_head);
3224 		*m_head0 = NULL;
3225 
3226 		freed = hn_txdesc_put(txr, txd);
3227 		KASSERT(freed != 0,
3228 		    ("fail to free txd upon txdma error"));
3229 
3230 		txr->hn_txdma_failed++;
3231 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3232 		return error;
3233 	}
3234 	*m_head0 = m_head;
3235 
3236 	/* +1 RNDIS packet message */
3237 	txr->hn_gpa_cnt = nsegs + 1;
3238 
3239 	/* send packet with page buffer */
3240 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3241 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3242 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3243 
3244 	/*
3245 	 * Fill the page buffers with mbuf info after the page
3246 	 * buffer for RNDIS packet message.
3247 	 */
3248 	for (i = 0; i < nsegs; ++i) {
3249 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3250 
3251 		gpa->gpa_page = atop(segs[i].ds_addr);
3252 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3253 		gpa->gpa_len = segs[i].ds_len;
3254 	}
3255 
3256 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3257 	txd->chim_size = 0;
3258 	txr->hn_sendpkt = hn_txpkt_sglist;
3259 done:
3260 	txd->m = m_head;
3261 
3262 	/* Set the completion routine */
3263 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3264 
3265 	/* Update temporary stats for later use. */
3266 	txr->hn_stat_pkts++;
3267 	txr->hn_stat_size += m_head->m_pkthdr.len;
3268 	if (m_head->m_flags & M_MCAST)
3269 		txr->hn_stat_mcasts++;
3270 
3271 	return 0;
3272 }
3273 
3274 /*
3275  * NOTE:
3276  * If this function fails, then txd will be freed, but the mbuf
3277  * associated w/ the txd will _not_ be freed.
3278  */
3279 static int
3280 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3281 {
3282 	int error, send_failed = 0, has_bpf;
3283 
3284 again:
3285 	has_bpf = bpf_peers_present(ifp->if_bpf);
3286 	if (has_bpf) {
3287 		/*
3288 		 * Make sure that this txd and any aggregated txds are not
3289 		 * freed before ETHER_BPF_MTAP.
3290 		 */
3291 		hn_txdesc_hold(txd);
3292 	}
3293 	error = txr->hn_sendpkt(txr, txd);
3294 	if (!error) {
3295 		if (has_bpf) {
3296 			const struct hn_txdesc *tmp_txd;
3297 
3298 			ETHER_BPF_MTAP(ifp, txd->m);
3299 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3300 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3301 		}
3302 
3303 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3304 #ifdef HN_IFSTART_SUPPORT
3305 		if (!hn_use_if_start)
3306 #endif
3307 		{
3308 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3309 			    txr->hn_stat_size);
3310 			if (txr->hn_stat_mcasts != 0) {
3311 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3312 				    txr->hn_stat_mcasts);
3313 			}
3314 		}
3315 		txr->hn_pkts += txr->hn_stat_pkts;
3316 		txr->hn_sends++;
3317 	}
3318 	if (has_bpf)
3319 		hn_txdesc_put(txr, txd);
3320 
3321 	if (__predict_false(error)) {
3322 		int freed __diagused;
3323 
3324 		/*
3325 		 * This should "really rarely" happen.
3326 		 *
3327 		 * XXX Too many RX to be acked or too many sideband
3328 		 * commands to run?  Ask netvsc_channel_rollup()
3329 		 * to kick start later.
3330 		 */
3331 		txr->hn_has_txeof = 1;
3332 		if (!send_failed) {
3333 			txr->hn_send_failed++;
3334 			send_failed = 1;
3335 			/*
3336 			 * Try sending again after set hn_has_txeof;
3337 			 * in case that we missed the last
3338 			 * netvsc_channel_rollup().
3339 			 */
3340 			goto again;
3341 		}
3342 		if_printf(ifp, "send failed\n");
3343 
3344 		/*
3345 		 * Caller will perform further processing on the
3346 		 * associated mbuf, so don't free it in hn_txdesc_put();
3347 		 * only unload it from the DMA map in hn_txdesc_put(),
3348 		 * if it was loaded.
3349 		 */
3350 		txd->m = NULL;
3351 		freed = hn_txdesc_put(txr, txd);
3352 		KASSERT(freed != 0,
3353 		    ("fail to free txd upon send error"));
3354 
3355 		txr->hn_send_failed++;
3356 	}
3357 
3358 	/* Reset temporary stats, after this sending is done. */
3359 	txr->hn_stat_size = 0;
3360 	txr->hn_stat_pkts = 0;
3361 	txr->hn_stat_mcasts = 0;
3362 
3363 	return (error);
3364 }
3365 
3366 /*
3367  * Append the specified data to the indicated mbuf chain,
3368  * Extend the mbuf chain if the new data does not fit in
3369  * existing space.
3370  *
3371  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3372  * There should be an equivalent in the kernel mbuf code,
3373  * but there does not appear to be one yet.
3374  *
3375  * Differs from m_append() in that additional mbufs are
3376  * allocated with cluster size MJUMPAGESIZE, and filled
3377  * accordingly.
3378  *
3379  * Return the last mbuf in the chain or NULL if failed to
3380  * allocate new mbuf.
3381  */
3382 static struct mbuf *
3383 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3384 {
3385 	struct mbuf *m, *n;
3386 	int remainder, space;
3387 
3388 	for (m = m0; m->m_next != NULL; m = m->m_next)
3389 		;
3390 	remainder = len;
3391 	space = M_TRAILINGSPACE(m);
3392 	if (space > 0) {
3393 		/*
3394 		 * Copy into available space.
3395 		 */
3396 		if (space > remainder)
3397 			space = remainder;
3398 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3399 		m->m_len += space;
3400 		cp += space;
3401 		remainder -= space;
3402 	}
3403 	while (remainder > 0) {
3404 		/*
3405 		 * Allocate a new mbuf; could check space
3406 		 * and allocate a cluster instead.
3407 		 */
3408 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3409 		if (n == NULL)
3410 			return NULL;
3411 		n->m_len = min(MJUMPAGESIZE, remainder);
3412 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3413 		cp += n->m_len;
3414 		remainder -= n->m_len;
3415 		m->m_next = n;
3416 		m = n;
3417 	}
3418 
3419 	return m;
3420 }
3421 
3422 #if defined(INET) || defined(INET6)
3423 static __inline int
3424 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3425 {
3426 #if __FreeBSD_version >= 1100095
3427 	if (hn_lro_mbufq_depth) {
3428 		tcp_lro_queue_mbuf(lc, m);
3429 		return 0;
3430 	}
3431 #endif
3432 	return tcp_lro_rx(lc, m, 0);
3433 }
3434 #endif
3435 
3436 static int
3437 hn_rxpkt(struct hn_rx_ring *rxr)
3438 {
3439 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3440 	struct mbuf *m_new, *n;
3441 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3442 	int hash_type = M_HASHTYPE_NONE;
3443 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3444 	int i;
3445 
3446 	ifp = hn_ifp;
3447 	if (rxr->hn_rxvf_ifp != NULL) {
3448 		/*
3449 		 * Non-transparent mode VF; pretend this packet is from
3450 		 * the VF.
3451 		 */
3452 		ifp = rxr->hn_rxvf_ifp;
3453 		is_vf = 1;
3454 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3455 		/* Transparent mode VF. */
3456 		is_vf = 1;
3457 	}
3458 
3459 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3460 		/*
3461 		 * NOTE:
3462 		 * See the NOTE of hn_rndis_init_fixat().  This
3463 		 * function can be reached, immediately after the
3464 		 * RNDIS is initialized but before the ifnet is
3465 		 * setup on the hn_attach() path; drop the unexpected
3466 		 * packets.
3467 		 */
3468 		return (0);
3469 	}
3470 
3471 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3472 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3473 		return (0);
3474 	}
3475 
3476 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3477 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3478 		if (m_new == NULL) {
3479 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3480 			return (0);
3481 		}
3482 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3483 		    rxr->rsc.frag_len[0]);
3484 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3485 	} else {
3486 		/*
3487 		 * Get an mbuf with a cluster.  For packets 2K or less,
3488 		 * get a standard 2K cluster.  For anything larger, get a
3489 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3490 		 * if looped around to the Hyper-V TX channel, so avoid them.
3491 		 */
3492 		size = MCLBYTES;
3493 		if (rxr->rsc.pktlen > MCLBYTES) {
3494 			/* 4096 */
3495 			size = MJUMPAGESIZE;
3496 		}
3497 
3498 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3499 		if (m_new == NULL) {
3500 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3501 			return (0);
3502 		}
3503 
3504 		n = m_new;
3505 		for (i = 0; i < rxr->rsc.cnt; i++) {
3506 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3507 			    rxr->rsc.frag_data[i]);
3508 			if (n == NULL) {
3509 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3510 				return (0);
3511 			} else {
3512 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3513 			}
3514 		}
3515 	}
3516 	if (rxr->rsc.pktlen <= MHLEN)
3517 		rxr->hn_small_pkts++;
3518 
3519 	m_new->m_pkthdr.rcvif = ifp;
3520 
3521 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3522 		do_csum = 0;
3523 
3524 	/* receive side checksum offload */
3525 	if (rxr->rsc.csum_info != NULL) {
3526 		/* IP csum offload */
3527 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3528 			m_new->m_pkthdr.csum_flags |=
3529 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3530 			rxr->hn_csum_ip++;
3531 		}
3532 
3533 		/* TCP/UDP csum offload */
3534 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3535 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3536 			m_new->m_pkthdr.csum_flags |=
3537 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3538 			m_new->m_pkthdr.csum_data = 0xffff;
3539 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3540 				rxr->hn_csum_tcp++;
3541 			else
3542 				rxr->hn_csum_udp++;
3543 		}
3544 
3545 		/*
3546 		 * XXX
3547 		 * As of this write (Oct 28th, 2016), host side will turn
3548 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3549 		 * the do_lro setting here is actually _not_ accurate.  We
3550 		 * depend on the RSS hash type check to reset do_lro.
3551 		 */
3552 		if ((*(rxr->rsc.csum_info) &
3553 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3554 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3555 			do_lro = 1;
3556 	} else {
3557 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3558 		if (l3proto == ETHERTYPE_IP) {
3559 			if (l4proto == IPPROTO_TCP) {
3560 				if (do_csum &&
3561 				    (rxr->hn_trust_hcsum &
3562 				     HN_TRUST_HCSUM_TCP)) {
3563 					rxr->hn_csum_trusted++;
3564 					m_new->m_pkthdr.csum_flags |=
3565 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3566 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3567 					m_new->m_pkthdr.csum_data = 0xffff;
3568 				}
3569 				do_lro = 1;
3570 			} else if (l4proto == IPPROTO_UDP) {
3571 				if (do_csum &&
3572 				    (rxr->hn_trust_hcsum &
3573 				     HN_TRUST_HCSUM_UDP)) {
3574 					rxr->hn_csum_trusted++;
3575 					m_new->m_pkthdr.csum_flags |=
3576 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3577 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3578 					m_new->m_pkthdr.csum_data = 0xffff;
3579 				}
3580 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3581 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3582 				rxr->hn_csum_trusted++;
3583 				m_new->m_pkthdr.csum_flags |=
3584 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3585 			}
3586 		}
3587 	}
3588 
3589 	if (rxr->rsc.vlan_info != NULL) {
3590 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3591 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3592 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3593 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3594 		m_new->m_flags |= M_VLANTAG;
3595 	}
3596 
3597 	/*
3598 	 * If VF is activated (tranparent/non-transparent mode does not
3599 	 * matter here).
3600 	 *
3601 	 * - Disable LRO
3602 	 *
3603 	 *   hn(4) will only receive broadcast packets, multicast packets,
3604 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3605 	 *   packet types.
3606 	 *
3607 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3608 	 *   all, since the LRO flush will use hn(4) as the receiving
3609 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3610 	 */
3611 	if (is_vf)
3612 		do_lro = 0;
3613 
3614 	/*
3615 	 * If VF is activated (tranparent/non-transparent mode does not
3616 	 * matter here), do _not_ mess with unsupported hash types or
3617 	 * functions.
3618 	 */
3619 	if (rxr->rsc.hash_info != NULL) {
3620 		rxr->hn_rss_pkts++;
3621 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3622 		if (!is_vf)
3623 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3624 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3625 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3626 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3627 			    rxr->hn_mbuf_hash);
3628 
3629 			/*
3630 			 * NOTE:
3631 			 * do_lro is resetted, if the hash types are not TCP
3632 			 * related.  See the comment in the above csum_flags
3633 			 * setup section.
3634 			 */
3635 			switch (type) {
3636 			case NDIS_HASH_IPV4:
3637 				hash_type = M_HASHTYPE_RSS_IPV4;
3638 				do_lro = 0;
3639 				break;
3640 
3641 			case NDIS_HASH_TCP_IPV4:
3642 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3643 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3644 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3645 
3646 					if (is_vf)
3647 						def_htype = M_HASHTYPE_NONE;
3648 
3649 					/*
3650 					 * UDP 4-tuple hash is delivered as
3651 					 * TCP 4-tuple hash.
3652 					 */
3653 					if (l3proto == ETHERTYPE_MAX) {
3654 						hn_rxpkt_proto(m_new,
3655 						    &l3proto, &l4proto);
3656 					}
3657 					if (l3proto == ETHERTYPE_IP) {
3658 						if (l4proto == IPPROTO_UDP &&
3659 						    (rxr->hn_mbuf_hash &
3660 						     NDIS_HASH_UDP_IPV4_X)) {
3661 							hash_type =
3662 							M_HASHTYPE_RSS_UDP_IPV4;
3663 							do_lro = 0;
3664 						} else if (l4proto !=
3665 						    IPPROTO_TCP) {
3666 							hash_type = def_htype;
3667 							do_lro = 0;
3668 						}
3669 					} else {
3670 						hash_type = def_htype;
3671 						do_lro = 0;
3672 					}
3673 				}
3674 				break;
3675 
3676 			case NDIS_HASH_IPV6:
3677 				hash_type = M_HASHTYPE_RSS_IPV6;
3678 				do_lro = 0;
3679 				break;
3680 
3681 			case NDIS_HASH_IPV6_EX:
3682 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3683 				do_lro = 0;
3684 				break;
3685 
3686 			case NDIS_HASH_TCP_IPV6:
3687 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3688 				break;
3689 
3690 			case NDIS_HASH_TCP_IPV6_EX:
3691 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3692 				break;
3693 			}
3694 		}
3695 	} else if (!is_vf) {
3696 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3697 		hash_type = M_HASHTYPE_OPAQUE;
3698 	}
3699 	M_HASHTYPE_SET(m_new, hash_type);
3700 
3701 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3702 	if (hn_ifp != ifp) {
3703 		const struct ether_header *eh;
3704 
3705 		/*
3706 		 * Non-transparent mode VF is activated.
3707 		 */
3708 
3709 		/*
3710 		 * Allow tapping on hn(4).
3711 		 */
3712 		ETHER_BPF_MTAP(hn_ifp, m_new);
3713 
3714 		/*
3715 		 * Update hn(4)'s stats.
3716 		 */
3717 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3718 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3719 		/* Checked at the beginning of this function. */
3720 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3721 		eh = mtod(m_new, struct ether_header *);
3722 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3723 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3724 	}
3725 	rxr->hn_pkts++;
3726 
3727 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3728 #if defined(INET) || defined(INET6)
3729 		struct lro_ctrl *lro = &rxr->hn_lro;
3730 
3731 		if (lro->lro_cnt) {
3732 			rxr->hn_lro_tried++;
3733 			if (hn_lro_rx(lro, m_new) == 0) {
3734 				/* DONE! */
3735 				return 0;
3736 			}
3737 		}
3738 #endif
3739 	}
3740 	ifp->if_input(ifp, m_new);
3741 
3742 	return (0);
3743 }
3744 
3745 static int
3746 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3747 {
3748 	struct hn_softc *sc = ifp->if_softc;
3749 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3750 	struct ifnet *vf_ifp;
3751 	int mask, error = 0;
3752 	struct ifrsskey *ifrk;
3753 	struct ifrsshash *ifrh;
3754 	uint32_t mtu;
3755 
3756 	switch (cmd) {
3757 	case SIOCSIFMTU:
3758 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3759 			error = EINVAL;
3760 			break;
3761 		}
3762 
3763 		HN_LOCK(sc);
3764 
3765 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3766 			HN_UNLOCK(sc);
3767 			break;
3768 		}
3769 
3770 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3771 			/* Can't change MTU */
3772 			HN_UNLOCK(sc);
3773 			error = EOPNOTSUPP;
3774 			break;
3775 		}
3776 
3777 		if (ifp->if_mtu == ifr->ifr_mtu) {
3778 			HN_UNLOCK(sc);
3779 			break;
3780 		}
3781 
3782 		if (hn_xpnt_vf_isready(sc)) {
3783 			vf_ifp = sc->hn_vf_ifp;
3784 			ifr_vf = *ifr;
3785 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3786 			    sizeof(ifr_vf.ifr_name));
3787 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3788 			    (caddr_t)&ifr_vf);
3789 			if (error) {
3790 				HN_UNLOCK(sc);
3791 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3792 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3793 				break;
3794 			}
3795 		}
3796 
3797 		/*
3798 		 * Suspend this interface before the synthetic parts
3799 		 * are ripped.
3800 		 */
3801 		hn_suspend(sc);
3802 
3803 		/*
3804 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3805 		 */
3806 		hn_synth_detach(sc);
3807 
3808 		/*
3809 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3810 		 * with the new MTU setting.
3811 		 */
3812 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3813 		if (error) {
3814 			HN_UNLOCK(sc);
3815 			break;
3816 		}
3817 
3818 		error = hn_rndis_get_mtu(sc, &mtu);
3819 		if (error)
3820 			mtu = ifr->ifr_mtu;
3821 		else if (bootverbose)
3822 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3823 
3824 		/*
3825 		 * Commit the requested MTU, after the synthetic parts
3826 		 * have been successfully attached.
3827 		 */
3828 		if (mtu >= ifr->ifr_mtu) {
3829 			mtu = ifr->ifr_mtu;
3830 		} else {
3831 			if_printf(ifp, "fixup mtu %d -> %u\n",
3832 			    ifr->ifr_mtu, mtu);
3833 		}
3834 		ifp->if_mtu = mtu;
3835 
3836 		/*
3837 		 * Synthetic parts' reattach may change the chimney
3838 		 * sending size; update it.
3839 		 */
3840 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3841 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3842 
3843 		/*
3844 		 * Make sure that various parameters based on MTU are
3845 		 * still valid, after the MTU change.
3846 		 */
3847 		hn_mtu_change_fixup(sc);
3848 
3849 		/*
3850 		 * All done!  Resume the interface now.
3851 		 */
3852 		hn_resume(sc);
3853 
3854 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3855 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3856 			/*
3857 			 * Since we have reattached the NVS part,
3858 			 * change the datapath to VF again; in case
3859 			 * that it is lost, after the NVS was detached.
3860 			 */
3861 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3862 		}
3863 
3864 		HN_UNLOCK(sc);
3865 		break;
3866 
3867 	case SIOCSIFFLAGS:
3868 		HN_LOCK(sc);
3869 
3870 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3871 			HN_UNLOCK(sc);
3872 			break;
3873 		}
3874 
3875 		if (hn_xpnt_vf_isready(sc))
3876 			hn_xpnt_vf_saveifflags(sc);
3877 
3878 		if (ifp->if_flags & IFF_UP) {
3879 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3880 				/*
3881 				 * Caller meight hold mutex, e.g.
3882 				 * bpf; use busy-wait for the RNDIS
3883 				 * reply.
3884 				 */
3885 				HN_NO_SLEEPING(sc);
3886 				hn_rxfilter_config(sc);
3887 				HN_SLEEPING_OK(sc);
3888 
3889 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3890 					error = hn_xpnt_vf_iocsetflags(sc);
3891 			} else {
3892 				hn_init_locked(sc);
3893 			}
3894 		} else {
3895 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3896 				hn_stop(sc, false);
3897 		}
3898 		sc->hn_if_flags = ifp->if_flags;
3899 
3900 		HN_UNLOCK(sc);
3901 		break;
3902 
3903 	case SIOCSIFCAP:
3904 		HN_LOCK(sc);
3905 
3906 		if (hn_xpnt_vf_isready(sc)) {
3907 			ifr_vf = *ifr;
3908 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3909 			    sizeof(ifr_vf.ifr_name));
3910 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3911 			HN_UNLOCK(sc);
3912 			break;
3913 		}
3914 
3915 		/*
3916 		 * Fix up requested capabilities w/ supported capabilities,
3917 		 * since the supported capabilities could have been changed.
3918 		 */
3919 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3920 		    ifp->if_capenable;
3921 
3922 		if (mask & IFCAP_TXCSUM) {
3923 			ifp->if_capenable ^= IFCAP_TXCSUM;
3924 			if (ifp->if_capenable & IFCAP_TXCSUM)
3925 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3926 			else
3927 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3928 		}
3929 		if (mask & IFCAP_TXCSUM_IPV6) {
3930 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3931 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3932 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3933 			else
3934 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3935 		}
3936 
3937 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3938 		if (mask & IFCAP_RXCSUM)
3939 			ifp->if_capenable ^= IFCAP_RXCSUM;
3940 #ifdef foo
3941 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3942 		if (mask & IFCAP_RXCSUM_IPV6)
3943 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3944 #endif
3945 
3946 		if (mask & IFCAP_LRO)
3947 			ifp->if_capenable ^= IFCAP_LRO;
3948 
3949 		if (mask & IFCAP_TSO4) {
3950 			ifp->if_capenable ^= IFCAP_TSO4;
3951 			if (ifp->if_capenable & IFCAP_TSO4)
3952 				ifp->if_hwassist |= CSUM_IP_TSO;
3953 			else
3954 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3955 		}
3956 		if (mask & IFCAP_TSO6) {
3957 			ifp->if_capenable ^= IFCAP_TSO6;
3958 			if (ifp->if_capenable & IFCAP_TSO6)
3959 				ifp->if_hwassist |= CSUM_IP6_TSO;
3960 			else
3961 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3962 		}
3963 
3964 		HN_UNLOCK(sc);
3965 		break;
3966 
3967 	case SIOCADDMULTI:
3968 	case SIOCDELMULTI:
3969 		HN_LOCK(sc);
3970 
3971 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3972 			HN_UNLOCK(sc);
3973 			break;
3974 		}
3975 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3976 			/*
3977 			 * Multicast uses mutex; use busy-wait for
3978 			 * the RNDIS reply.
3979 			 */
3980 			HN_NO_SLEEPING(sc);
3981 			hn_rxfilter_config(sc);
3982 			HN_SLEEPING_OK(sc);
3983 		}
3984 
3985 		/* XXX vlan(4) style mcast addr maintenance */
3986 		if (hn_xpnt_vf_isready(sc)) {
3987 			int old_if_flags;
3988 
3989 			old_if_flags = sc->hn_vf_ifp->if_flags;
3990 			hn_xpnt_vf_saveifflags(sc);
3991 
3992 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3993 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3994 			     IFF_ALLMULTI))
3995 				error = hn_xpnt_vf_iocsetflags(sc);
3996 		}
3997 
3998 		HN_UNLOCK(sc);
3999 		break;
4000 
4001 	case SIOCSIFMEDIA:
4002 	case SIOCGIFMEDIA:
4003 		HN_LOCK(sc);
4004 		if (hn_xpnt_vf_isready(sc)) {
4005 			/*
4006 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4007 			 * create and pass ifr_vf to the VF here; just
4008 			 * replace the ifr_name.
4009 			 */
4010 			vf_ifp = sc->hn_vf_ifp;
4011 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4012 			    sizeof(ifr->ifr_name));
4013 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4014 			/* Restore the ifr_name. */
4015 			strlcpy(ifr->ifr_name, ifp->if_xname,
4016 			    sizeof(ifr->ifr_name));
4017 			HN_UNLOCK(sc);
4018 			break;
4019 		}
4020 		HN_UNLOCK(sc);
4021 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4022 		break;
4023 
4024 	case SIOCGIFRSSHASH:
4025 		ifrh = (struct ifrsshash *)data;
4026 		HN_LOCK(sc);
4027 		if (sc->hn_rx_ring_inuse == 1) {
4028 			HN_UNLOCK(sc);
4029 			ifrh->ifrh_func = RSS_FUNC_NONE;
4030 			ifrh->ifrh_types = 0;
4031 			break;
4032 		}
4033 
4034 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4035 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4036 		else
4037 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4038 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4039 		HN_UNLOCK(sc);
4040 		break;
4041 
4042 	case SIOCGIFRSSKEY:
4043 		ifrk = (struct ifrsskey *)data;
4044 		HN_LOCK(sc);
4045 		if (sc->hn_rx_ring_inuse == 1) {
4046 			HN_UNLOCK(sc);
4047 			ifrk->ifrk_func = RSS_FUNC_NONE;
4048 			ifrk->ifrk_keylen = 0;
4049 			break;
4050 		}
4051 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4052 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4053 		else
4054 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4055 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4056 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4057 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4058 		HN_UNLOCK(sc);
4059 		break;
4060 
4061 	default:
4062 		error = ether_ioctl(ifp, cmd, data);
4063 		break;
4064 	}
4065 	return (error);
4066 }
4067 
4068 static void
4069 hn_stop(struct hn_softc *sc, bool detaching)
4070 {
4071 	struct ifnet *ifp = sc->hn_ifp;
4072 	int i;
4073 
4074 	HN_LOCK_ASSERT(sc);
4075 
4076 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4077 	    ("synthetic parts were not attached"));
4078 
4079 	/* Clear RUNNING bit ASAP. */
4080 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4081 
4082 	/* Disable polling. */
4083 	hn_polling(sc, 0);
4084 
4085 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4086 		KASSERT(sc->hn_vf_ifp != NULL,
4087 		    ("%s: VF is not attached", ifp->if_xname));
4088 
4089 		/* Mark transparent mode VF as disabled. */
4090 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4091 
4092 		/*
4093 		 * NOTE:
4094 		 * Datapath setting must happen _before_ bringing
4095 		 * the VF down.
4096 		 */
4097 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4098 
4099 		/*
4100 		 * Bring the VF down.
4101 		 */
4102 		hn_xpnt_vf_saveifflags(sc);
4103 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4104 		hn_xpnt_vf_iocsetflags(sc);
4105 	}
4106 
4107 	/* Suspend data transfers. */
4108 	hn_suspend_data(sc);
4109 
4110 	/* Clear OACTIVE bit. */
4111 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4112 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4113 		sc->hn_tx_ring[i].hn_oactive = 0;
4114 
4115 	/*
4116 	 * If the non-transparent mode VF is active, make sure
4117 	 * that the RX filter still allows packet reception.
4118 	 */
4119 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4120 		hn_rxfilter_config(sc);
4121 }
4122 
4123 static void
4124 hn_init_locked(struct hn_softc *sc)
4125 {
4126 	struct ifnet *ifp = sc->hn_ifp;
4127 	int i;
4128 
4129 	HN_LOCK_ASSERT(sc);
4130 
4131 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4132 		return;
4133 
4134 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4135 		return;
4136 
4137 	/* Configure RX filter */
4138 	hn_rxfilter_config(sc);
4139 
4140 	/* Clear OACTIVE bit. */
4141 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4142 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4143 		sc->hn_tx_ring[i].hn_oactive = 0;
4144 
4145 	/* Clear TX 'suspended' bit. */
4146 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4147 
4148 	if (hn_xpnt_vf_isready(sc)) {
4149 		/* Initialize transparent VF. */
4150 		hn_xpnt_vf_init(sc);
4151 	}
4152 
4153 	/* Everything is ready; unleash! */
4154 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4155 
4156 	/* Re-enable polling if requested. */
4157 	if (sc->hn_pollhz > 0)
4158 		hn_polling(sc, sc->hn_pollhz);
4159 }
4160 
4161 static void
4162 hn_init(void *xsc)
4163 {
4164 	struct hn_softc *sc = xsc;
4165 
4166 	HN_LOCK(sc);
4167 	hn_init_locked(sc);
4168 	HN_UNLOCK(sc);
4169 }
4170 
4171 #if __FreeBSD_version >= 1100099
4172 
4173 static int
4174 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4175 {
4176 	struct hn_softc *sc = arg1;
4177 	unsigned int lenlim;
4178 	int error;
4179 
4180 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4181 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4182 	if (error || req->newptr == NULL)
4183 		return error;
4184 
4185 	HN_LOCK(sc);
4186 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4187 	    lenlim > TCP_LRO_LENGTH_MAX) {
4188 		HN_UNLOCK(sc);
4189 		return EINVAL;
4190 	}
4191 	hn_set_lro_lenlim(sc, lenlim);
4192 	HN_UNLOCK(sc);
4193 
4194 	return 0;
4195 }
4196 
4197 static int
4198 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4199 {
4200 	struct hn_softc *sc = arg1;
4201 	int ackcnt, error, i;
4202 
4203 	/*
4204 	 * lro_ackcnt_lim is append count limit,
4205 	 * +1 to turn it into aggregation limit.
4206 	 */
4207 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4208 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4209 	if (error || req->newptr == NULL)
4210 		return error;
4211 
4212 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4213 		return EINVAL;
4214 
4215 	/*
4216 	 * Convert aggregation limit back to append
4217 	 * count limit.
4218 	 */
4219 	--ackcnt;
4220 	HN_LOCK(sc);
4221 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4222 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4223 	HN_UNLOCK(sc);
4224 	return 0;
4225 }
4226 
4227 #endif
4228 
4229 static int
4230 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4231 {
4232 	struct hn_softc *sc = arg1;
4233 	int hcsum = arg2;
4234 	int on, error, i;
4235 
4236 	on = 0;
4237 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4238 		on = 1;
4239 
4240 	error = sysctl_handle_int(oidp, &on, 0, req);
4241 	if (error || req->newptr == NULL)
4242 		return error;
4243 
4244 	HN_LOCK(sc);
4245 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4246 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4247 
4248 		if (on)
4249 			rxr->hn_trust_hcsum |= hcsum;
4250 		else
4251 			rxr->hn_trust_hcsum &= ~hcsum;
4252 	}
4253 	HN_UNLOCK(sc);
4254 	return 0;
4255 }
4256 
4257 static int
4258 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4259 {
4260 	struct hn_softc *sc = arg1;
4261 	int chim_size, error;
4262 
4263 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4264 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4265 	if (error || req->newptr == NULL)
4266 		return error;
4267 
4268 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4269 		return EINVAL;
4270 
4271 	HN_LOCK(sc);
4272 	hn_set_chim_size(sc, chim_size);
4273 	HN_UNLOCK(sc);
4274 	return 0;
4275 }
4276 
4277 #if __FreeBSD_version < 1100095
4278 static int
4279 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4280 {
4281 	struct hn_softc *sc = arg1;
4282 	int ofs = arg2, i, error;
4283 	struct hn_rx_ring *rxr;
4284 	uint64_t stat;
4285 
4286 	stat = 0;
4287 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4288 		rxr = &sc->hn_rx_ring[i];
4289 		stat += *((int *)((uint8_t *)rxr + ofs));
4290 	}
4291 
4292 	error = sysctl_handle_64(oidp, &stat, 0, req);
4293 	if (error || req->newptr == NULL)
4294 		return error;
4295 
4296 	/* Zero out this stat. */
4297 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4298 		rxr = &sc->hn_rx_ring[i];
4299 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4300 	}
4301 	return 0;
4302 }
4303 #else
4304 static int
4305 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4306 {
4307 	struct hn_softc *sc = arg1;
4308 	int ofs = arg2, i, error;
4309 	struct hn_rx_ring *rxr;
4310 	uint64_t stat;
4311 
4312 	stat = 0;
4313 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4314 		rxr = &sc->hn_rx_ring[i];
4315 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4316 	}
4317 
4318 	error = sysctl_handle_64(oidp, &stat, 0, req);
4319 	if (error || req->newptr == NULL)
4320 		return error;
4321 
4322 	/* Zero out this stat. */
4323 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4324 		rxr = &sc->hn_rx_ring[i];
4325 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4326 	}
4327 	return 0;
4328 }
4329 
4330 #endif
4331 
4332 static int
4333 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4334 {
4335 	struct hn_softc *sc = arg1;
4336 	int ofs = arg2, i, error;
4337 	struct hn_rx_ring *rxr;
4338 	u_long stat;
4339 
4340 	stat = 0;
4341 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4342 		rxr = &sc->hn_rx_ring[i];
4343 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4344 	}
4345 
4346 	error = sysctl_handle_long(oidp, &stat, 0, req);
4347 	if (error || req->newptr == NULL)
4348 		return error;
4349 
4350 	/* Zero out this stat. */
4351 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4352 		rxr = &sc->hn_rx_ring[i];
4353 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4354 	}
4355 	return 0;
4356 }
4357 
4358 static int
4359 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4360 {
4361 	struct hn_softc *sc = arg1;
4362 	int ofs = arg2, i, error;
4363 	struct hn_tx_ring *txr;
4364 	u_long stat;
4365 
4366 	stat = 0;
4367 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4368 		txr = &sc->hn_tx_ring[i];
4369 		stat += *((u_long *)((uint8_t *)txr + ofs));
4370 	}
4371 
4372 	error = sysctl_handle_long(oidp, &stat, 0, req);
4373 	if (error || req->newptr == NULL)
4374 		return error;
4375 
4376 	/* Zero out this stat. */
4377 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4378 		txr = &sc->hn_tx_ring[i];
4379 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4380 	}
4381 	return 0;
4382 }
4383 
4384 static int
4385 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4386 {
4387 	struct hn_softc *sc = arg1;
4388 	int ofs = arg2, i, error, conf;
4389 	struct hn_tx_ring *txr;
4390 
4391 	txr = &sc->hn_tx_ring[0];
4392 	conf = *((int *)((uint8_t *)txr + ofs));
4393 
4394 	error = sysctl_handle_int(oidp, &conf, 0, req);
4395 	if (error || req->newptr == NULL)
4396 		return error;
4397 
4398 	HN_LOCK(sc);
4399 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4400 		txr = &sc->hn_tx_ring[i];
4401 		*((int *)((uint8_t *)txr + ofs)) = conf;
4402 	}
4403 	HN_UNLOCK(sc);
4404 
4405 	return 0;
4406 }
4407 
4408 static int
4409 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4410 {
4411 	struct hn_softc *sc = arg1;
4412 	int error, size;
4413 
4414 	size = sc->hn_agg_size;
4415 	error = sysctl_handle_int(oidp, &size, 0, req);
4416 	if (error || req->newptr == NULL)
4417 		return (error);
4418 
4419 	HN_LOCK(sc);
4420 	sc->hn_agg_size = size;
4421 	hn_set_txagg(sc);
4422 	HN_UNLOCK(sc);
4423 
4424 	return (0);
4425 }
4426 
4427 static int
4428 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4429 {
4430 	struct hn_softc *sc = arg1;
4431 	int error, pkts;
4432 
4433 	pkts = sc->hn_agg_pkts;
4434 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4435 	if (error || req->newptr == NULL)
4436 		return (error);
4437 
4438 	HN_LOCK(sc);
4439 	sc->hn_agg_pkts = pkts;
4440 	hn_set_txagg(sc);
4441 	HN_UNLOCK(sc);
4442 
4443 	return (0);
4444 }
4445 
4446 static int
4447 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4448 {
4449 	struct hn_softc *sc = arg1;
4450 	int pkts;
4451 
4452 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4453 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4454 }
4455 
4456 static int
4457 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4458 {
4459 	struct hn_softc *sc = arg1;
4460 	int align;
4461 
4462 	align = sc->hn_tx_ring[0].hn_agg_align;
4463 	return (sysctl_handle_int(oidp, &align, 0, req));
4464 }
4465 
4466 static void
4467 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4468 {
4469 	if (pollhz == 0)
4470 		vmbus_chan_poll_disable(chan);
4471 	else
4472 		vmbus_chan_poll_enable(chan, pollhz);
4473 }
4474 
4475 static void
4476 hn_polling(struct hn_softc *sc, u_int pollhz)
4477 {
4478 	int nsubch = sc->hn_rx_ring_inuse - 1;
4479 
4480 	HN_LOCK_ASSERT(sc);
4481 
4482 	if (nsubch > 0) {
4483 		struct vmbus_channel **subch;
4484 		int i;
4485 
4486 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4487 		for (i = 0; i < nsubch; ++i)
4488 			hn_chan_polling(subch[i], pollhz);
4489 		vmbus_subchan_rel(subch, nsubch);
4490 	}
4491 	hn_chan_polling(sc->hn_prichan, pollhz);
4492 }
4493 
4494 static int
4495 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4496 {
4497 	struct hn_softc *sc = arg1;
4498 	int pollhz, error;
4499 
4500 	pollhz = sc->hn_pollhz;
4501 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4502 	if (error || req->newptr == NULL)
4503 		return (error);
4504 
4505 	if (pollhz != 0 &&
4506 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4507 		return (EINVAL);
4508 
4509 	HN_LOCK(sc);
4510 	if (sc->hn_pollhz != pollhz) {
4511 		sc->hn_pollhz = pollhz;
4512 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4513 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4514 			hn_polling(sc, sc->hn_pollhz);
4515 	}
4516 	HN_UNLOCK(sc);
4517 
4518 	return (0);
4519 }
4520 
4521 static int
4522 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4523 {
4524 	struct hn_softc *sc = arg1;
4525 	char verstr[16];
4526 
4527 	snprintf(verstr, sizeof(verstr), "%u.%u",
4528 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4529 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4530 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4531 }
4532 
4533 static int
4534 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4535 {
4536 	struct hn_softc *sc = arg1;
4537 	char caps_str[128];
4538 	uint32_t caps;
4539 
4540 	HN_LOCK(sc);
4541 	caps = sc->hn_caps;
4542 	HN_UNLOCK(sc);
4543 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4544 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4545 }
4546 
4547 static int
4548 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4549 {
4550 	struct hn_softc *sc = arg1;
4551 	char assist_str[128];
4552 	uint32_t hwassist;
4553 
4554 	HN_LOCK(sc);
4555 	hwassist = sc->hn_ifp->if_hwassist;
4556 	HN_UNLOCK(sc);
4557 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4558 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4559 }
4560 
4561 static int
4562 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4563 {
4564 	struct hn_softc *sc = arg1;
4565 	char filter_str[128];
4566 	uint32_t filter;
4567 
4568 	HN_LOCK(sc);
4569 	filter = sc->hn_rx_filter;
4570 	HN_UNLOCK(sc);
4571 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4572 	    NDIS_PACKET_TYPES);
4573 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4574 }
4575 
4576 static int
4577 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4578 {
4579 	struct hn_softc *sc = arg1;
4580 	uint32_t mtu;
4581 	int error;
4582 	HN_LOCK(sc);
4583 	error = hn_rndis_get_mtu(sc, &mtu);
4584 	if (error) {
4585 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4586 		goto back;
4587 	}
4588 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4589 	if (error || req->newptr == NULL)
4590 		goto back;
4591 
4592 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4593 	if (error)
4594 		goto back;
4595 	error = hn_rndis_reconf_offload(sc, mtu);
4596 back:
4597 	HN_UNLOCK(sc);
4598 	return (error);
4599 }
4600 #ifndef RSS
4601 
4602 static int
4603 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4604 {
4605 	struct hn_softc *sc = arg1;
4606 	int error;
4607 
4608 	HN_LOCK(sc);
4609 
4610 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4611 	if (error || req->newptr == NULL)
4612 		goto back;
4613 
4614 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4615 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4616 		/*
4617 		 * RSS key is synchronized w/ VF's, don't allow users
4618 		 * to change it.
4619 		 */
4620 		error = EBUSY;
4621 		goto back;
4622 	}
4623 
4624 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4625 	if (error)
4626 		goto back;
4627 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4628 
4629 	if (sc->hn_rx_ring_inuse > 1) {
4630 		error = hn_rss_reconfig(sc);
4631 	} else {
4632 		/* Not RSS capable, at least for now; just save the RSS key. */
4633 		error = 0;
4634 	}
4635 back:
4636 	HN_UNLOCK(sc);
4637 	return (error);
4638 }
4639 
4640 static int
4641 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4642 {
4643 	struct hn_softc *sc = arg1;
4644 	int error;
4645 
4646 	HN_LOCK(sc);
4647 
4648 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4649 	if (error || req->newptr == NULL)
4650 		goto back;
4651 
4652 	/*
4653 	 * Don't allow RSS indirect table change, if this interface is not
4654 	 * RSS capable currently.
4655 	 */
4656 	if (sc->hn_rx_ring_inuse == 1) {
4657 		error = EOPNOTSUPP;
4658 		goto back;
4659 	}
4660 
4661 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4662 	if (error)
4663 		goto back;
4664 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4665 
4666 	hn_rss_ind_fixup(sc);
4667 	error = hn_rss_reconfig(sc);
4668 back:
4669 	HN_UNLOCK(sc);
4670 	return (error);
4671 }
4672 
4673 #endif	/* !RSS */
4674 
4675 static int
4676 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4677 {
4678 	struct hn_softc *sc = arg1;
4679 	char hash_str[128];
4680 	uint32_t hash;
4681 
4682 	HN_LOCK(sc);
4683 	hash = sc->hn_rss_hash;
4684 	HN_UNLOCK(sc);
4685 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4686 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4687 }
4688 
4689 static int
4690 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4691 {
4692 	struct hn_softc *sc = arg1;
4693 	char hash_str[128];
4694 	uint32_t hash;
4695 
4696 	HN_LOCK(sc);
4697 	hash = sc->hn_rss_hcap;
4698 	HN_UNLOCK(sc);
4699 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4700 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4701 }
4702 
4703 static int
4704 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4705 {
4706 	struct hn_softc *sc = arg1;
4707 	char hash_str[128];
4708 	uint32_t hash;
4709 
4710 	HN_LOCK(sc);
4711 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4712 	HN_UNLOCK(sc);
4713 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4714 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4715 }
4716 
4717 static int
4718 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4719 {
4720 	struct hn_softc *sc = arg1;
4721 	char vf_name[IFNAMSIZ + 1];
4722 	struct ifnet *vf_ifp;
4723 
4724 	HN_LOCK(sc);
4725 	vf_name[0] = '\0';
4726 	vf_ifp = sc->hn_vf_ifp;
4727 	if (vf_ifp != NULL)
4728 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4729 	HN_UNLOCK(sc);
4730 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4731 }
4732 
4733 static int
4734 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4735 {
4736 	struct hn_softc *sc = arg1;
4737 	char vf_name[IFNAMSIZ + 1];
4738 	struct ifnet *vf_ifp;
4739 
4740 	HN_LOCK(sc);
4741 	vf_name[0] = '\0';
4742 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4743 	if (vf_ifp != NULL)
4744 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4745 	HN_UNLOCK(sc);
4746 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4747 }
4748 
4749 static int
4750 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4751 {
4752 	struct rm_priotracker pt;
4753 	struct sbuf *sb;
4754 	int error, i;
4755 	bool first;
4756 
4757 	error = sysctl_wire_old_buffer(req, 0);
4758 	if (error != 0)
4759 		return (error);
4760 
4761 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4762 	if (sb == NULL)
4763 		return (ENOMEM);
4764 
4765 	rm_rlock(&hn_vfmap_lock, &pt);
4766 
4767 	first = true;
4768 	for (i = 0; i < hn_vfmap_size; ++i) {
4769 		struct epoch_tracker et;
4770 		struct ifnet *ifp;
4771 
4772 		if (hn_vfmap[i] == NULL)
4773 			continue;
4774 
4775 		NET_EPOCH_ENTER(et);
4776 		ifp = ifnet_byindex(i);
4777 		if (ifp != NULL) {
4778 			if (first)
4779 				sbuf_printf(sb, "%s", ifp->if_xname);
4780 			else
4781 				sbuf_printf(sb, " %s", ifp->if_xname);
4782 			first = false;
4783 		}
4784 		NET_EPOCH_EXIT(et);
4785 	}
4786 
4787 	rm_runlock(&hn_vfmap_lock, &pt);
4788 
4789 	error = sbuf_finish(sb);
4790 	sbuf_delete(sb);
4791 	return (error);
4792 }
4793 
4794 static int
4795 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4796 {
4797 	struct rm_priotracker pt;
4798 	struct sbuf *sb;
4799 	int error, i;
4800 	bool first;
4801 
4802 	error = sysctl_wire_old_buffer(req, 0);
4803 	if (error != 0)
4804 		return (error);
4805 
4806 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4807 	if (sb == NULL)
4808 		return (ENOMEM);
4809 
4810 	rm_rlock(&hn_vfmap_lock, &pt);
4811 
4812 	first = true;
4813 	for (i = 0; i < hn_vfmap_size; ++i) {
4814 		struct epoch_tracker et;
4815 		struct ifnet *ifp, *hn_ifp;
4816 
4817 		hn_ifp = hn_vfmap[i];
4818 		if (hn_ifp == NULL)
4819 			continue;
4820 
4821 		NET_EPOCH_ENTER(et);
4822 		ifp = ifnet_byindex(i);
4823 		if (ifp != NULL) {
4824 			if (first) {
4825 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4826 				    hn_ifp->if_xname);
4827 			} else {
4828 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4829 				    hn_ifp->if_xname);
4830 			}
4831 			first = false;
4832 		}
4833 		NET_EPOCH_EXIT(et);
4834 	}
4835 
4836 	rm_runlock(&hn_vfmap_lock, &pt);
4837 
4838 	error = sbuf_finish(sb);
4839 	sbuf_delete(sb);
4840 	return (error);
4841 }
4842 
4843 static int
4844 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4845 {
4846 	struct hn_softc *sc = arg1;
4847 	int error, onoff = 0;
4848 
4849 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4850 		onoff = 1;
4851 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4852 	if (error || req->newptr == NULL)
4853 		return (error);
4854 
4855 	HN_LOCK(sc);
4856 	/* NOTE: hn_vf_lock for hn_transmit() */
4857 	rm_wlock(&sc->hn_vf_lock);
4858 	if (onoff)
4859 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4860 	else
4861 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4862 	rm_wunlock(&sc->hn_vf_lock);
4863 	HN_UNLOCK(sc);
4864 
4865 	return (0);
4866 }
4867 
4868 static int
4869 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4870 {
4871 	struct hn_softc *sc = arg1;
4872 	int enabled = 0;
4873 
4874 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4875 		enabled = 1;
4876 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4877 }
4878 
4879 static int
4880 hn_check_iplen(const struct mbuf *m, int hoff)
4881 {
4882 	const struct ip *ip;
4883 	int len, iphlen, iplen;
4884 	const struct tcphdr *th;
4885 	int thoff;				/* TCP data offset */
4886 
4887 	len = hoff + sizeof(struct ip);
4888 
4889 	/* The packet must be at least the size of an IP header. */
4890 	if (m->m_pkthdr.len < len)
4891 		return IPPROTO_DONE;
4892 
4893 	/* The fixed IP header must reside completely in the first mbuf. */
4894 	if (m->m_len < len)
4895 		return IPPROTO_DONE;
4896 
4897 	ip = mtodo(m, hoff);
4898 
4899 	/* Bound check the packet's stated IP header length. */
4900 	iphlen = ip->ip_hl << 2;
4901 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4902 		return IPPROTO_DONE;
4903 
4904 	/* The full IP header must reside completely in the one mbuf. */
4905 	if (m->m_len < hoff + iphlen)
4906 		return IPPROTO_DONE;
4907 
4908 	iplen = ntohs(ip->ip_len);
4909 
4910 	/*
4911 	 * Check that the amount of data in the buffers is as
4912 	 * at least much as the IP header would have us expect.
4913 	 */
4914 	if (m->m_pkthdr.len < hoff + iplen)
4915 		return IPPROTO_DONE;
4916 
4917 	/*
4918 	 * Ignore IP fragments.
4919 	 */
4920 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4921 		return IPPROTO_DONE;
4922 
4923 	/*
4924 	 * The TCP/IP or UDP/IP header must be entirely contained within
4925 	 * the first fragment of a packet.
4926 	 */
4927 	switch (ip->ip_p) {
4928 	case IPPROTO_TCP:
4929 		if (iplen < iphlen + sizeof(struct tcphdr))
4930 			return IPPROTO_DONE;
4931 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4932 			return IPPROTO_DONE;
4933 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4934 		thoff = th->th_off << 2;
4935 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4936 			return IPPROTO_DONE;
4937 		if (m->m_len < hoff + iphlen + thoff)
4938 			return IPPROTO_DONE;
4939 		break;
4940 	case IPPROTO_UDP:
4941 		if (iplen < iphlen + sizeof(struct udphdr))
4942 			return IPPROTO_DONE;
4943 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4944 			return IPPROTO_DONE;
4945 		break;
4946 	default:
4947 		if (iplen < iphlen)
4948 			return IPPROTO_DONE;
4949 		break;
4950 	}
4951 	return ip->ip_p;
4952 }
4953 
4954 static void
4955 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4956 {
4957 	const struct ether_header *eh;
4958 	uint16_t etype;
4959 	int hoff;
4960 
4961 	hoff = sizeof(*eh);
4962 	/* Checked at the beginning of this function. */
4963 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4964 
4965 	eh = mtod(m_new, const struct ether_header *);
4966 	etype = ntohs(eh->ether_type);
4967 	if (etype == ETHERTYPE_VLAN) {
4968 		const struct ether_vlan_header *evl;
4969 
4970 		hoff = sizeof(*evl);
4971 		if (m_new->m_len < hoff)
4972 			return;
4973 		evl = mtod(m_new, const struct ether_vlan_header *);
4974 		etype = ntohs(evl->evl_proto);
4975 	}
4976 	*l3proto = etype;
4977 
4978 	if (etype == ETHERTYPE_IP)
4979 		*l4proto = hn_check_iplen(m_new, hoff);
4980 	else
4981 		*l4proto = IPPROTO_DONE;
4982 }
4983 
4984 static int
4985 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4986 {
4987 	struct sysctl_oid_list *child;
4988 	struct sysctl_ctx_list *ctx;
4989 	device_t dev = sc->hn_dev;
4990 #if defined(INET) || defined(INET6)
4991 #if __FreeBSD_version >= 1100095
4992 	int lroent_cnt;
4993 #endif
4994 #endif
4995 	int i;
4996 
4997 	/*
4998 	 * Create RXBUF for reception.
4999 	 *
5000 	 * NOTE:
5001 	 * - It is shared by all channels.
5002 	 * - A large enough buffer is allocated, certain version of NVSes
5003 	 *   may further limit the usable space.
5004 	 */
5005 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5006 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
5007 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5008 	if (sc->hn_rxbuf == NULL) {
5009 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
5010 		return (ENOMEM);
5011 	}
5012 
5013 	sc->hn_rx_ring_cnt = ring_cnt;
5014 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
5015 
5016 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
5017 	    M_DEVBUF, M_WAITOK | M_ZERO);
5018 
5019 #if defined(INET) || defined(INET6)
5020 #if __FreeBSD_version >= 1100095
5021 	lroent_cnt = hn_lro_entry_count;
5022 	if (lroent_cnt < TCP_LRO_ENTRIES)
5023 		lroent_cnt = TCP_LRO_ENTRIES;
5024 	if (bootverbose)
5025 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5026 #endif
5027 #endif	/* INET || INET6 */
5028 
5029 	ctx = device_get_sysctl_ctx(dev);
5030 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5031 
5032 	/* Create dev.hn.UNIT.rx sysctl tree */
5033 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5034 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5035 
5036 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5037 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5038 
5039 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5040 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5041 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5042 		if (rxr->hn_br == NULL) {
5043 			device_printf(dev, "allocate bufring failed\n");
5044 			return (ENOMEM);
5045 		}
5046 
5047 		if (hn_trust_hosttcp)
5048 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5049 		if (hn_trust_hostudp)
5050 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5051 		if (hn_trust_hostip)
5052 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5053 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5054 		rxr->hn_ifp = sc->hn_ifp;
5055 		if (i < sc->hn_tx_ring_cnt)
5056 			rxr->hn_txr = &sc->hn_tx_ring[i];
5057 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5058 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5059 		rxr->hn_rx_idx = i;
5060 		rxr->hn_rxbuf = sc->hn_rxbuf;
5061 
5062 		/*
5063 		 * Initialize LRO.
5064 		 */
5065 #if defined(INET) || defined(INET6)
5066 #if __FreeBSD_version >= 1100095
5067 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5068 		    hn_lro_mbufq_depth);
5069 #else
5070 		tcp_lro_init(&rxr->hn_lro);
5071 		rxr->hn_lro.ifp = sc->hn_ifp;
5072 #endif
5073 #if __FreeBSD_version >= 1100099
5074 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5075 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5076 #endif
5077 #endif	/* INET || INET6 */
5078 
5079 		if (sc->hn_rx_sysctl_tree != NULL) {
5080 			char name[16];
5081 
5082 			/*
5083 			 * Create per RX ring sysctl tree:
5084 			 * dev.hn.UNIT.rx.RINGID
5085 			 */
5086 			snprintf(name, sizeof(name), "%d", i);
5087 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5088 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5089 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5090 
5091 			if (rxr->hn_rx_sysctl_tree != NULL) {
5092 				SYSCTL_ADD_ULONG(ctx,
5093 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5094 				    OID_AUTO, "packets",
5095 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5096 				    "# of packets received");
5097 				SYSCTL_ADD_ULONG(ctx,
5098 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5099 				    OID_AUTO, "rss_pkts",
5100 				    CTLFLAG_RW | CTLFLAG_STATS,
5101 				    &rxr->hn_rss_pkts,
5102 				    "# of packets w/ RSS info received");
5103 				SYSCTL_ADD_ULONG(ctx,
5104 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5105 				    OID_AUTO, "rsc_pkts",
5106 				    CTLFLAG_RW | CTLFLAG_STATS,
5107 				    &rxr->hn_rsc_pkts,
5108 				    "# of RSC packets received");
5109 				SYSCTL_ADD_ULONG(ctx,
5110 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5111 				    OID_AUTO, "rsc_drop",
5112 				    CTLFLAG_RW | CTLFLAG_STATS,
5113 				    &rxr->hn_rsc_drop,
5114 				    "# of RSC fragments dropped");
5115 				SYSCTL_ADD_INT(ctx,
5116 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5117 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5118 				    &rxr->hn_pktbuf_len, 0,
5119 				    "Temporary channel packet buffer length");
5120 			}
5121 		}
5122 	}
5123 
5124 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5125 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5126 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5127 #if __FreeBSD_version < 1100095
5128 	    hn_rx_stat_int_sysctl,
5129 #else
5130 	    hn_rx_stat_u64_sysctl,
5131 #endif
5132 	    "LU", "LRO queued");
5133 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5134 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5135 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5136 #if __FreeBSD_version < 1100095
5137 	    hn_rx_stat_int_sysctl,
5138 #else
5139 	    hn_rx_stat_u64_sysctl,
5140 #endif
5141 	    "LU", "LRO flushed");
5142 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5143 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5144 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5145 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5146 #if __FreeBSD_version >= 1100099
5147 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5148 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5149 	    hn_lro_lenlim_sysctl, "IU",
5150 	    "Max # of data bytes to be aggregated by LRO");
5151 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5152 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5153 	    hn_lro_ackcnt_sysctl, "I",
5154 	    "Max # of ACKs to be aggregated by LRO");
5155 #endif
5156 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5157 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5158 	    hn_trust_hcsum_sysctl, "I",
5159 	    "Trust tcp segment verification on host side, "
5160 	    "when csum info is missing");
5161 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5162 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5163 	    hn_trust_hcsum_sysctl, "I",
5164 	    "Trust udp datagram verification on host side, "
5165 	    "when csum info is missing");
5166 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5167 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5168 	    hn_trust_hcsum_sysctl, "I",
5169 	    "Trust ip packet verification on host side, "
5170 	    "when csum info is missing");
5171 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5172 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5173 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5174 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5175 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5176 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5177 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5178 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5179 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5180 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5181 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5182 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5183 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5184 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5185 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5186 	    hn_rx_stat_ulong_sysctl, "LU",
5187 	    "# of packets that we trust host's csum verification");
5188 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5189 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5190 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5191 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5192 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5193 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5194 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5195 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5196 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5197 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5198 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5199 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5200 
5201 	return (0);
5202 }
5203 
5204 static void
5205 hn_destroy_rx_data(struct hn_softc *sc)
5206 {
5207 	int i;
5208 
5209 	if (sc->hn_rxbuf != NULL) {
5210 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5211 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5212 		else
5213 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5214 		sc->hn_rxbuf = NULL;
5215 	}
5216 
5217 	if (sc->hn_rx_ring_cnt == 0)
5218 		return;
5219 
5220 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5221 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5222 
5223 		if (rxr->hn_br == NULL)
5224 			continue;
5225 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5226 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5227 		} else {
5228 			device_printf(sc->hn_dev,
5229 			    "%dth channel bufring is referenced", i);
5230 		}
5231 		rxr->hn_br = NULL;
5232 
5233 #if defined(INET) || defined(INET6)
5234 		tcp_lro_free(&rxr->hn_lro);
5235 #endif
5236 		free(rxr->hn_pktbuf, M_DEVBUF);
5237 	}
5238 	free(sc->hn_rx_ring, M_DEVBUF);
5239 	sc->hn_rx_ring = NULL;
5240 
5241 	sc->hn_rx_ring_cnt = 0;
5242 	sc->hn_rx_ring_inuse = 0;
5243 }
5244 
5245 static int
5246 hn_tx_ring_create(struct hn_softc *sc, int id)
5247 {
5248 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5249 	device_t dev = sc->hn_dev;
5250 	bus_dma_tag_t parent_dtag;
5251 	int error, i;
5252 
5253 	txr->hn_sc = sc;
5254 	txr->hn_tx_idx = id;
5255 
5256 #ifndef HN_USE_TXDESC_BUFRING
5257 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5258 #endif
5259 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5260 
5261 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5262 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5263 	    M_DEVBUF, M_WAITOK | M_ZERO);
5264 #ifndef HN_USE_TXDESC_BUFRING
5265 	SLIST_INIT(&txr->hn_txlist);
5266 #else
5267 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5268 	    M_WAITOK, &txr->hn_tx_lock);
5269 #endif
5270 
5271 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5272 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5273 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5274 	} else {
5275 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5276 	}
5277 
5278 #ifdef HN_IFSTART_SUPPORT
5279 	if (hn_use_if_start) {
5280 		txr->hn_txeof = hn_start_txeof;
5281 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5282 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5283 	} else
5284 #endif
5285 	{
5286 		int br_depth;
5287 
5288 		txr->hn_txeof = hn_xmit_txeof;
5289 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5290 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5291 
5292 		br_depth = hn_get_txswq_depth(txr);
5293 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5294 		    M_WAITOK, &txr->hn_tx_lock);
5295 	}
5296 
5297 	txr->hn_direct_tx_size = hn_direct_tx_size;
5298 
5299 	/*
5300 	 * Always schedule transmission instead of trying to do direct
5301 	 * transmission.  This one gives the best performance so far.
5302 	 */
5303 	txr->hn_sched_tx = 1;
5304 
5305 	parent_dtag = bus_get_dma_tag(dev);
5306 
5307 	/* DMA tag for RNDIS packet messages. */
5308 	error = bus_dma_tag_create(parent_dtag, /* parent */
5309 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5310 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5311 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5312 	    BUS_SPACE_MAXADDR,		/* highaddr */
5313 	    NULL, NULL,			/* filter, filterarg */
5314 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5315 	    1,				/* nsegments */
5316 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5317 	    0,				/* flags */
5318 	    NULL,			/* lockfunc */
5319 	    NULL,			/* lockfuncarg */
5320 	    &txr->hn_tx_rndis_dtag);
5321 	if (error) {
5322 		device_printf(dev, "failed to create rndis dmatag\n");
5323 		return error;
5324 	}
5325 
5326 	/* DMA tag for data. */
5327 	error = bus_dma_tag_create(parent_dtag, /* parent */
5328 	    1,				/* alignment */
5329 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5330 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5331 	    BUS_SPACE_MAXADDR,		/* highaddr */
5332 	    NULL, NULL,			/* filter, filterarg */
5333 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5334 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5335 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5336 	    0,				/* flags */
5337 	    NULL,			/* lockfunc */
5338 	    NULL,			/* lockfuncarg */
5339 	    &txr->hn_tx_data_dtag);
5340 	if (error) {
5341 		device_printf(dev, "failed to create data dmatag\n");
5342 		return error;
5343 	}
5344 
5345 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5346 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5347 
5348 		txd->txr = txr;
5349 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5350 		STAILQ_INIT(&txd->agg_list);
5351 
5352 		/*
5353 		 * Allocate and load RNDIS packet message.
5354 		 */
5355         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5356 		    (void **)&txd->rndis_pkt,
5357 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5358 		    &txd->rndis_pkt_dmap);
5359 		if (error) {
5360 			device_printf(dev,
5361 			    "failed to allocate rndis_packet_msg, %d\n", i);
5362 			return error;
5363 		}
5364 
5365 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5366 		    txd->rndis_pkt_dmap,
5367 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5368 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5369 		    BUS_DMA_NOWAIT);
5370 		if (error) {
5371 			device_printf(dev,
5372 			    "failed to load rndis_packet_msg, %d\n", i);
5373 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5374 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5375 			return error;
5376 		}
5377 
5378 		/* DMA map for TX data. */
5379 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5380 		    &txd->data_dmap);
5381 		if (error) {
5382 			device_printf(dev,
5383 			    "failed to allocate tx data dmamap\n");
5384 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5385 			    txd->rndis_pkt_dmap);
5386 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5387 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5388 			return error;
5389 		}
5390 
5391 		/* All set, put it to list */
5392 		txd->flags |= HN_TXD_FLAG_ONLIST;
5393 #ifndef HN_USE_TXDESC_BUFRING
5394 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5395 #else
5396 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5397 #endif
5398 	}
5399 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5400 
5401 	if (sc->hn_tx_sysctl_tree != NULL) {
5402 		struct sysctl_oid_list *child;
5403 		struct sysctl_ctx_list *ctx;
5404 		char name[16];
5405 
5406 		/*
5407 		 * Create per TX ring sysctl tree:
5408 		 * dev.hn.UNIT.tx.RINGID
5409 		 */
5410 		ctx = device_get_sysctl_ctx(dev);
5411 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5412 
5413 		snprintf(name, sizeof(name), "%d", id);
5414 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5415 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5416 
5417 		if (txr->hn_tx_sysctl_tree != NULL) {
5418 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5419 
5420 #ifdef HN_DEBUG
5421 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5422 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5423 			    "# of available TX descs");
5424 #endif
5425 #ifdef HN_IFSTART_SUPPORT
5426 			if (!hn_use_if_start)
5427 #endif
5428 			{
5429 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5430 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5431 				    "over active");
5432 			}
5433 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5434 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5435 			    "# of packets transmitted");
5436 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5437 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5438 			    "# of sends");
5439 		}
5440 	}
5441 
5442 	return 0;
5443 }
5444 
5445 static void
5446 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5447 {
5448 	struct hn_tx_ring *txr = txd->txr;
5449 
5450 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5451 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5452 
5453 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5454 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5455 	    txd->rndis_pkt_dmap);
5456 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5457 }
5458 
5459 static void
5460 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5461 {
5462 
5463 	KASSERT(txd->refs == 0 || txd->refs == 1,
5464 	    ("invalid txd refs %d", txd->refs));
5465 
5466 	/* Aggregated txds will be freed by their aggregating txd. */
5467 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5468 		int freed __diagused;
5469 
5470 		freed = hn_txdesc_put(txr, txd);
5471 		KASSERT(freed, ("can't free txdesc"));
5472 	}
5473 }
5474 
5475 static void
5476 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5477 {
5478 	int i;
5479 
5480 	if (txr->hn_txdesc == NULL)
5481 		return;
5482 
5483 	/*
5484 	 * NOTE:
5485 	 * Because the freeing of aggregated txds will be deferred
5486 	 * to the aggregating txd, two passes are used here:
5487 	 * - The first pass GCes any pending txds.  This GC is necessary,
5488 	 *   since if the channels are revoked, hypervisor will not
5489 	 *   deliver send-done for all pending txds.
5490 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5491 	 *   were freed.
5492 	 */
5493 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5494 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5495 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5496 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5497 
5498 	if (txr->hn_tx_data_dtag != NULL)
5499 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5500 	if (txr->hn_tx_rndis_dtag != NULL)
5501 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5502 
5503 #ifdef HN_USE_TXDESC_BUFRING
5504 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5505 #endif
5506 
5507 	free(txr->hn_txdesc, M_DEVBUF);
5508 	txr->hn_txdesc = NULL;
5509 
5510 	if (txr->hn_mbuf_br != NULL)
5511 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5512 
5513 #ifndef HN_USE_TXDESC_BUFRING
5514 	mtx_destroy(&txr->hn_txlist_spin);
5515 #endif
5516 	mtx_destroy(&txr->hn_tx_lock);
5517 }
5518 
5519 static int
5520 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5521 {
5522 	struct sysctl_oid_list *child;
5523 	struct sysctl_ctx_list *ctx;
5524 	int i;
5525 
5526 	/*
5527 	 * Create TXBUF for chimney sending.
5528 	 *
5529 	 * NOTE: It is shared by all channels.
5530 	 */
5531 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5532 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5533 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5534 	if (sc->hn_chim == NULL) {
5535 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5536 		return (ENOMEM);
5537 	}
5538 
5539 	sc->hn_tx_ring_cnt = ring_cnt;
5540 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5541 
5542 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5543 	    M_DEVBUF, M_WAITOK | M_ZERO);
5544 
5545 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5546 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5547 
5548 	/* Create dev.hn.UNIT.tx sysctl tree */
5549 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5550 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5551 
5552 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5553 		int error;
5554 
5555 		error = hn_tx_ring_create(sc, i);
5556 		if (error)
5557 			return error;
5558 	}
5559 
5560 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5561 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5562 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5563 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5564 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5565 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5566 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5567 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5568 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5569 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5570 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5571 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5572 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5573 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5574 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5575 	    hn_tx_stat_ulong_sysctl, "LU",
5576 	    "# of packet transmission aggregation flush failure");
5577 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5578 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5579 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5580 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5581 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5582 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5583 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5584 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5585 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5586 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5587 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5588 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5589 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5590 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5591 	    "# of total TX descs");
5592 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5593 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5594 	    "Chimney send packet size upper boundary");
5595 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5596 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5597 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5598 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5599 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5600 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5601 	    hn_tx_conf_int_sysctl, "I",
5602 	    "Size of the packet for direct transmission");
5603 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5604 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5605 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5606 	    hn_tx_conf_int_sysctl, "I",
5607 	    "Always schedule transmission "
5608 	    "instead of doing direct transmission");
5609 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5610 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5611 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5612 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5613 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5614 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5615 	    "Applied packet transmission aggregation size");
5616 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5617 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5618 	    hn_txagg_pktmax_sysctl, "I",
5619 	    "Applied packet transmission aggregation packets");
5620 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5621 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5622 	    hn_txagg_align_sysctl, "I",
5623 	    "Applied packet transmission aggregation alignment");
5624 
5625 	return 0;
5626 }
5627 
5628 static void
5629 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5630 {
5631 	int i;
5632 
5633 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5634 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5635 }
5636 
5637 static void
5638 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5639 {
5640 	struct ifnet *ifp = sc->hn_ifp;
5641 	u_int hw_tsomax;
5642 	int tso_minlen;
5643 
5644 	HN_LOCK_ASSERT(sc);
5645 
5646 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5647 		return;
5648 
5649 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5650 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5651 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5652 
5653 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5654 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5655 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5656 
5657 	if (tso_maxlen < tso_minlen)
5658 		tso_maxlen = tso_minlen;
5659 	else if (tso_maxlen > IP_MAXPACKET)
5660 		tso_maxlen = IP_MAXPACKET;
5661 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5662 		tso_maxlen = sc->hn_ndis_tso_szmax;
5663 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5664 
5665 	if (hn_xpnt_vf_isready(sc)) {
5666 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5667 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5668 	}
5669 	ifp->if_hw_tsomax = hw_tsomax;
5670 	if (bootverbose)
5671 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5672 }
5673 
5674 static void
5675 hn_fixup_tx_data(struct hn_softc *sc)
5676 {
5677 	uint64_t csum_assist;
5678 	int i;
5679 
5680 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5681 	if (hn_tx_chimney_size > 0 &&
5682 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5683 		hn_set_chim_size(sc, hn_tx_chimney_size);
5684 
5685 	csum_assist = 0;
5686 	if (sc->hn_caps & HN_CAP_IPCS)
5687 		csum_assist |= CSUM_IP;
5688 	if (sc->hn_caps & HN_CAP_TCP4CS)
5689 		csum_assist |= CSUM_IP_TCP;
5690 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5691 		csum_assist |= CSUM_IP_UDP;
5692 	if (sc->hn_caps & HN_CAP_TCP6CS)
5693 		csum_assist |= CSUM_IP6_TCP;
5694 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5695 		csum_assist |= CSUM_IP6_UDP;
5696 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5697 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5698 
5699 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5700 		/*
5701 		 * Support HASHVAL pktinfo on TX path.
5702 		 */
5703 		if (bootverbose)
5704 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5705 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5706 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5707 	}
5708 }
5709 
5710 static void
5711 hn_fixup_rx_data(struct hn_softc *sc)
5712 {
5713 
5714 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5715 		int i;
5716 
5717 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5718 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5719 	}
5720 }
5721 
5722 static void
5723 hn_destroy_tx_data(struct hn_softc *sc)
5724 {
5725 	int i;
5726 
5727 	if (sc->hn_chim != NULL) {
5728 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5729 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5730 		} else {
5731 			device_printf(sc->hn_dev,
5732 			    "chimney sending buffer is referenced");
5733 		}
5734 		sc->hn_chim = NULL;
5735 	}
5736 
5737 	if (sc->hn_tx_ring_cnt == 0)
5738 		return;
5739 
5740 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5741 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5742 
5743 	free(sc->hn_tx_ring, M_DEVBUF);
5744 	sc->hn_tx_ring = NULL;
5745 
5746 	sc->hn_tx_ring_cnt = 0;
5747 	sc->hn_tx_ring_inuse = 0;
5748 }
5749 
5750 #ifdef HN_IFSTART_SUPPORT
5751 
5752 static void
5753 hn_start_taskfunc(void *xtxr, int pending __unused)
5754 {
5755 	struct hn_tx_ring *txr = xtxr;
5756 
5757 	mtx_lock(&txr->hn_tx_lock);
5758 	hn_start_locked(txr, 0);
5759 	mtx_unlock(&txr->hn_tx_lock);
5760 }
5761 
5762 static int
5763 hn_start_locked(struct hn_tx_ring *txr, int len)
5764 {
5765 	struct hn_softc *sc = txr->hn_sc;
5766 	struct ifnet *ifp = sc->hn_ifp;
5767 	int sched = 0;
5768 
5769 	KASSERT(hn_use_if_start,
5770 	    ("hn_start_locked is called, when if_start is disabled"));
5771 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5772 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5773 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5774 
5775 	if (__predict_false(txr->hn_suspended))
5776 		return (0);
5777 
5778 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5779 	    IFF_DRV_RUNNING)
5780 		return (0);
5781 
5782 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5783 		struct hn_txdesc *txd;
5784 		struct mbuf *m_head;
5785 		int error;
5786 
5787 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5788 		if (m_head == NULL)
5789 			break;
5790 
5791 		if (len > 0 && m_head->m_pkthdr.len > len) {
5792 			/*
5793 			 * This sending could be time consuming; let callers
5794 			 * dispatch this packet sending (and sending of any
5795 			 * following up packets) to tx taskqueue.
5796 			 */
5797 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5798 			sched = 1;
5799 			break;
5800 		}
5801 
5802 #if defined(INET6) || defined(INET)
5803 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5804 			m_head = hn_tso_fixup(m_head);
5805 			if (__predict_false(m_head == NULL)) {
5806 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5807 				continue;
5808 			}
5809 		} else if (m_head->m_pkthdr.csum_flags &
5810 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5811 			m_head = hn_set_hlen(m_head);
5812 			if (__predict_false(m_head == NULL)) {
5813 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5814 				continue;
5815 			}
5816 		}
5817 #endif
5818 
5819 		txd = hn_txdesc_get(txr);
5820 		if (txd == NULL) {
5821 			txr->hn_no_txdescs++;
5822 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5823 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5824 			break;
5825 		}
5826 
5827 		error = hn_encap(ifp, txr, txd, &m_head);
5828 		if (error) {
5829 			/* Both txd and m_head are freed */
5830 			KASSERT(txr->hn_agg_txd == NULL,
5831 			    ("encap failed w/ pending aggregating txdesc"));
5832 			continue;
5833 		}
5834 
5835 		if (txr->hn_agg_pktleft == 0) {
5836 			if (txr->hn_agg_txd != NULL) {
5837 				KASSERT(m_head == NULL,
5838 				    ("pending mbuf for aggregating txdesc"));
5839 				error = hn_flush_txagg(ifp, txr);
5840 				if (__predict_false(error)) {
5841 					atomic_set_int(&ifp->if_drv_flags,
5842 					    IFF_DRV_OACTIVE);
5843 					break;
5844 				}
5845 			} else {
5846 				KASSERT(m_head != NULL, ("mbuf was freed"));
5847 				error = hn_txpkt(ifp, txr, txd);
5848 				if (__predict_false(error)) {
5849 					/* txd is freed, but m_head is not */
5850 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5851 					atomic_set_int(&ifp->if_drv_flags,
5852 					    IFF_DRV_OACTIVE);
5853 					break;
5854 				}
5855 			}
5856 		}
5857 #ifdef INVARIANTS
5858 		else {
5859 			KASSERT(txr->hn_agg_txd != NULL,
5860 			    ("no aggregating txdesc"));
5861 			KASSERT(m_head == NULL,
5862 			    ("pending mbuf for aggregating txdesc"));
5863 		}
5864 #endif
5865 	}
5866 
5867 	/* Flush pending aggerated transmission. */
5868 	if (txr->hn_agg_txd != NULL)
5869 		hn_flush_txagg(ifp, txr);
5870 	return (sched);
5871 }
5872 
5873 static void
5874 hn_start(struct ifnet *ifp)
5875 {
5876 	struct hn_softc *sc = ifp->if_softc;
5877 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5878 
5879 	if (txr->hn_sched_tx)
5880 		goto do_sched;
5881 
5882 	if (mtx_trylock(&txr->hn_tx_lock)) {
5883 		int sched;
5884 
5885 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5886 		mtx_unlock(&txr->hn_tx_lock);
5887 		if (!sched)
5888 			return;
5889 	}
5890 do_sched:
5891 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5892 }
5893 
5894 static void
5895 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5896 {
5897 	struct hn_tx_ring *txr = xtxr;
5898 
5899 	mtx_lock(&txr->hn_tx_lock);
5900 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5901 	hn_start_locked(txr, 0);
5902 	mtx_unlock(&txr->hn_tx_lock);
5903 }
5904 
5905 static void
5906 hn_start_txeof(struct hn_tx_ring *txr)
5907 {
5908 	struct hn_softc *sc = txr->hn_sc;
5909 	struct ifnet *ifp = sc->hn_ifp;
5910 
5911 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5912 
5913 	if (txr->hn_sched_tx)
5914 		goto do_sched;
5915 
5916 	if (mtx_trylock(&txr->hn_tx_lock)) {
5917 		int sched;
5918 
5919 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5920 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5921 		mtx_unlock(&txr->hn_tx_lock);
5922 		if (sched) {
5923 			taskqueue_enqueue(txr->hn_tx_taskq,
5924 			    &txr->hn_tx_task);
5925 		}
5926 	} else {
5927 do_sched:
5928 		/*
5929 		 * Release the OACTIVE earlier, with the hope, that
5930 		 * others could catch up.  The task will clear the
5931 		 * flag again with the hn_tx_lock to avoid possible
5932 		 * races.
5933 		 */
5934 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5935 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5936 	}
5937 }
5938 
5939 #endif	/* HN_IFSTART_SUPPORT */
5940 
5941 static int
5942 hn_xmit(struct hn_tx_ring *txr, int len)
5943 {
5944 	struct hn_softc *sc = txr->hn_sc;
5945 	struct ifnet *ifp = sc->hn_ifp;
5946 	struct mbuf *m_head;
5947 	int sched = 0;
5948 
5949 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5950 #ifdef HN_IFSTART_SUPPORT
5951 	KASSERT(hn_use_if_start == 0,
5952 	    ("hn_xmit is called, when if_start is enabled"));
5953 #endif
5954 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5955 
5956 	if (__predict_false(txr->hn_suspended))
5957 		return (0);
5958 
5959 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5960 		return (0);
5961 
5962 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5963 		struct hn_txdesc *txd;
5964 		int error;
5965 
5966 		if (len > 0 && m_head->m_pkthdr.len > len) {
5967 			/*
5968 			 * This sending could be time consuming; let callers
5969 			 * dispatch this packet sending (and sending of any
5970 			 * following up packets) to tx taskqueue.
5971 			 */
5972 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5973 			sched = 1;
5974 			break;
5975 		}
5976 
5977 		txd = hn_txdesc_get(txr);
5978 		if (txd == NULL) {
5979 			txr->hn_no_txdescs++;
5980 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5981 			txr->hn_oactive = 1;
5982 			break;
5983 		}
5984 
5985 		error = hn_encap(ifp, txr, txd, &m_head);
5986 		if (error) {
5987 			/* Both txd and m_head are freed; discard */
5988 			KASSERT(txr->hn_agg_txd == NULL,
5989 			    ("encap failed w/ pending aggregating txdesc"));
5990 			drbr_advance(ifp, txr->hn_mbuf_br);
5991 			continue;
5992 		}
5993 
5994 		if (txr->hn_agg_pktleft == 0) {
5995 			if (txr->hn_agg_txd != NULL) {
5996 				KASSERT(m_head == NULL,
5997 				    ("pending mbuf for aggregating txdesc"));
5998 				error = hn_flush_txagg(ifp, txr);
5999 				if (__predict_false(error)) {
6000 					txr->hn_oactive = 1;
6001 					break;
6002 				}
6003 			} else {
6004 				KASSERT(m_head != NULL, ("mbuf was freed"));
6005 				error = hn_txpkt(ifp, txr, txd);
6006 				if (__predict_false(error)) {
6007 					/* txd is freed, but m_head is not */
6008 					drbr_putback(ifp, txr->hn_mbuf_br,
6009 					    m_head);
6010 					txr->hn_oactive = 1;
6011 					break;
6012 				}
6013 			}
6014 		}
6015 #ifdef INVARIANTS
6016 		else {
6017 			KASSERT(txr->hn_agg_txd != NULL,
6018 			    ("no aggregating txdesc"));
6019 			KASSERT(m_head == NULL,
6020 			    ("pending mbuf for aggregating txdesc"));
6021 		}
6022 #endif
6023 
6024 		/* Sent */
6025 		drbr_advance(ifp, txr->hn_mbuf_br);
6026 	}
6027 
6028 	/* Flush pending aggerated transmission. */
6029 	if (txr->hn_agg_txd != NULL)
6030 		hn_flush_txagg(ifp, txr);
6031 	return (sched);
6032 }
6033 
6034 static int
6035 hn_transmit(struct ifnet *ifp, struct mbuf *m)
6036 {
6037 	struct hn_softc *sc = ifp->if_softc;
6038 	struct hn_tx_ring *txr;
6039 	int error, idx = 0;
6040 
6041 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6042 		struct rm_priotracker pt;
6043 
6044 		rm_rlock(&sc->hn_vf_lock, &pt);
6045 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6046 			struct mbuf *m_bpf = NULL;
6047 			int obytes, omcast;
6048 
6049 			obytes = m->m_pkthdr.len;
6050 			omcast = (m->m_flags & M_MCAST) != 0;
6051 
6052 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6053 				if (bpf_peers_present(ifp->if_bpf)) {
6054 					m_bpf = m_copypacket(m, M_NOWAIT);
6055 					if (m_bpf == NULL) {
6056 						/*
6057 						 * Failed to grab a shallow
6058 						 * copy; tap now.
6059 						 */
6060 						ETHER_BPF_MTAP(ifp, m);
6061 					}
6062 				}
6063 			} else {
6064 				ETHER_BPF_MTAP(ifp, m);
6065 			}
6066 
6067 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6068 			rm_runlock(&sc->hn_vf_lock, &pt);
6069 
6070 			if (m_bpf != NULL) {
6071 				if (!error)
6072 					ETHER_BPF_MTAP(ifp, m_bpf);
6073 				m_freem(m_bpf);
6074 			}
6075 
6076 			if (error == ENOBUFS) {
6077 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6078 			} else if (error) {
6079 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6080 			} else {
6081 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6082 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6083 				if (omcast) {
6084 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6085 					    omcast);
6086 				}
6087 			}
6088 			return (error);
6089 		}
6090 		rm_runlock(&sc->hn_vf_lock, &pt);
6091 	}
6092 
6093 #if defined(INET6) || defined(INET)
6094 	/*
6095 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6096 	 * since packet headers should be cache-hot.
6097 	 */
6098 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6099 		m = hn_tso_fixup(m);
6100 		if (__predict_false(m == NULL)) {
6101 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6102 			return EIO;
6103 		}
6104 	} else if (m->m_pkthdr.csum_flags &
6105 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6106 		m = hn_set_hlen(m);
6107 		if (__predict_false(m == NULL)) {
6108 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6109 			return EIO;
6110 		}
6111 	}
6112 #endif
6113 
6114 	/*
6115 	 * Select the TX ring based on flowid
6116 	 */
6117 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6118 #ifdef RSS
6119 		uint32_t bid;
6120 
6121 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6122 		    &bid) == 0)
6123 			idx = bid % sc->hn_tx_ring_inuse;
6124 		else
6125 #endif
6126 		{
6127 #if defined(INET6) || defined(INET)
6128 			int tcpsyn = 0;
6129 
6130 			if (m->m_pkthdr.len < 128 &&
6131 			    (m->m_pkthdr.csum_flags &
6132 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6133 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6134 				m = hn_check_tcpsyn(m, &tcpsyn);
6135 				if (__predict_false(m == NULL)) {
6136 					if_inc_counter(ifp,
6137 					    IFCOUNTER_OERRORS, 1);
6138 					return (EIO);
6139 				}
6140 			}
6141 #else
6142 			const int tcpsyn = 0;
6143 #endif
6144 			if (tcpsyn)
6145 				idx = 0;
6146 			else
6147 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6148 		}
6149 	}
6150 	txr = &sc->hn_tx_ring[idx];
6151 
6152 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6153 	if (error) {
6154 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6155 		return error;
6156 	}
6157 
6158 	if (txr->hn_oactive)
6159 		return 0;
6160 
6161 	if (txr->hn_sched_tx)
6162 		goto do_sched;
6163 
6164 	if (mtx_trylock(&txr->hn_tx_lock)) {
6165 		int sched;
6166 
6167 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6168 		mtx_unlock(&txr->hn_tx_lock);
6169 		if (!sched)
6170 			return 0;
6171 	}
6172 do_sched:
6173 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6174 	return 0;
6175 }
6176 
6177 static void
6178 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6179 {
6180 	struct mbuf *m;
6181 
6182 	mtx_lock(&txr->hn_tx_lock);
6183 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6184 		m_freem(m);
6185 	mtx_unlock(&txr->hn_tx_lock);
6186 }
6187 
6188 static void
6189 hn_xmit_qflush(struct ifnet *ifp)
6190 {
6191 	struct hn_softc *sc = ifp->if_softc;
6192 	struct rm_priotracker pt;
6193 	int i;
6194 
6195 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6196 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6197 	if_qflush(ifp);
6198 
6199 	rm_rlock(&sc->hn_vf_lock, &pt);
6200 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6201 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6202 	rm_runlock(&sc->hn_vf_lock, &pt);
6203 }
6204 
6205 static void
6206 hn_xmit_txeof(struct hn_tx_ring *txr)
6207 {
6208 
6209 	if (txr->hn_sched_tx)
6210 		goto do_sched;
6211 
6212 	if (mtx_trylock(&txr->hn_tx_lock)) {
6213 		int sched;
6214 
6215 		txr->hn_oactive = 0;
6216 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6217 		mtx_unlock(&txr->hn_tx_lock);
6218 		if (sched) {
6219 			taskqueue_enqueue(txr->hn_tx_taskq,
6220 			    &txr->hn_tx_task);
6221 		}
6222 	} else {
6223 do_sched:
6224 		/*
6225 		 * Release the oactive earlier, with the hope, that
6226 		 * others could catch up.  The task will clear the
6227 		 * oactive again with the hn_tx_lock to avoid possible
6228 		 * races.
6229 		 */
6230 		txr->hn_oactive = 0;
6231 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6232 	}
6233 }
6234 
6235 static void
6236 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6237 {
6238 	struct hn_tx_ring *txr = xtxr;
6239 
6240 	mtx_lock(&txr->hn_tx_lock);
6241 	hn_xmit(txr, 0);
6242 	mtx_unlock(&txr->hn_tx_lock);
6243 }
6244 
6245 static void
6246 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6247 {
6248 	struct hn_tx_ring *txr = xtxr;
6249 
6250 	mtx_lock(&txr->hn_tx_lock);
6251 	txr->hn_oactive = 0;
6252 	hn_xmit(txr, 0);
6253 	mtx_unlock(&txr->hn_tx_lock);
6254 }
6255 
6256 static int
6257 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6258 {
6259 	struct vmbus_chan_br cbr;
6260 	struct hn_rx_ring *rxr;
6261 	struct hn_tx_ring *txr = NULL;
6262 	int idx, error;
6263 
6264 	idx = vmbus_chan_subidx(chan);
6265 
6266 	/*
6267 	 * Link this channel to RX/TX ring.
6268 	 */
6269 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6270 	    ("invalid channel index %d, should > 0 && < %d",
6271 	     idx, sc->hn_rx_ring_inuse));
6272 	rxr = &sc->hn_rx_ring[idx];
6273 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6274 	    ("RX ring %d already attached", idx));
6275 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6276 	rxr->hn_chan = chan;
6277 
6278 	if (bootverbose) {
6279 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6280 		    idx, vmbus_chan_id(chan));
6281 	}
6282 
6283 	if (idx < sc->hn_tx_ring_inuse) {
6284 		txr = &sc->hn_tx_ring[idx];
6285 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6286 		    ("TX ring %d already attached", idx));
6287 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6288 
6289 		txr->hn_chan = chan;
6290 		if (bootverbose) {
6291 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6292 			    idx, vmbus_chan_id(chan));
6293 		}
6294 	}
6295 
6296 	/* Bind this channel to a proper CPU. */
6297 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6298 
6299 	/*
6300 	 * Open this channel
6301 	 */
6302 	cbr.cbr = rxr->hn_br;
6303 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6304 	cbr.cbr_txsz = HN_TXBR_SIZE;
6305 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6306 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6307 	if (error) {
6308 		if (error == EISCONN) {
6309 			if_printf(sc->hn_ifp, "bufring is connected after "
6310 			    "chan%u open failure\n", vmbus_chan_id(chan));
6311 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6312 		} else {
6313 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6314 			    vmbus_chan_id(chan), error);
6315 		}
6316 	}
6317 	return (error);
6318 }
6319 
6320 static void
6321 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6322 {
6323 	struct hn_rx_ring *rxr;
6324 	int idx, error;
6325 
6326 	idx = vmbus_chan_subidx(chan);
6327 
6328 	/*
6329 	 * Link this channel to RX/TX ring.
6330 	 */
6331 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6332 	    ("invalid channel index %d, should > 0 && < %d",
6333 	     idx, sc->hn_rx_ring_inuse));
6334 	rxr = &sc->hn_rx_ring[idx];
6335 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6336 	    ("RX ring %d is not attached", idx));
6337 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6338 
6339 	if (idx < sc->hn_tx_ring_inuse) {
6340 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6341 
6342 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6343 		    ("TX ring %d is not attached attached", idx));
6344 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6345 	}
6346 
6347 	/*
6348 	 * Close this channel.
6349 	 *
6350 	 * NOTE:
6351 	 * Channel closing does _not_ destroy the target channel.
6352 	 */
6353 	error = vmbus_chan_close_direct(chan);
6354 	if (error == EISCONN) {
6355 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6356 		    "after being closed\n", vmbus_chan_id(chan));
6357 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6358 	} else if (error) {
6359 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6360 		    vmbus_chan_id(chan), error);
6361 	}
6362 }
6363 
6364 static int
6365 hn_attach_subchans(struct hn_softc *sc)
6366 {
6367 	struct vmbus_channel **subchans;
6368 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6369 	int i, error = 0;
6370 
6371 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6372 
6373 	/* Attach the sub-channels. */
6374 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6375 	for (i = 0; i < subchan_cnt; ++i) {
6376 		int error1;
6377 
6378 		error1 = hn_chan_attach(sc, subchans[i]);
6379 		if (error1) {
6380 			error = error1;
6381 			/* Move on; all channels will be detached later. */
6382 		}
6383 	}
6384 	vmbus_subchan_rel(subchans, subchan_cnt);
6385 
6386 	if (error) {
6387 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6388 	} else {
6389 		if (bootverbose) {
6390 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6391 			    subchan_cnt);
6392 		}
6393 	}
6394 	return (error);
6395 }
6396 
6397 static void
6398 hn_detach_allchans(struct hn_softc *sc)
6399 {
6400 	struct vmbus_channel **subchans;
6401 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6402 	int i;
6403 
6404 	if (subchan_cnt == 0)
6405 		goto back;
6406 
6407 	/* Detach the sub-channels. */
6408 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6409 	for (i = 0; i < subchan_cnt; ++i)
6410 		hn_chan_detach(sc, subchans[i]);
6411 	vmbus_subchan_rel(subchans, subchan_cnt);
6412 
6413 back:
6414 	/*
6415 	 * Detach the primary channel, _after_ all sub-channels
6416 	 * are detached.
6417 	 */
6418 	hn_chan_detach(sc, sc->hn_prichan);
6419 
6420 	/* Wait for sub-channels to be destroyed, if any. */
6421 	vmbus_subchan_drain(sc->hn_prichan);
6422 
6423 #ifdef INVARIANTS
6424 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6425 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6426 		    HN_RX_FLAG_ATTACHED) == 0,
6427 		    ("%dth RX ring is still attached", i));
6428 	}
6429 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6430 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6431 		    HN_TX_FLAG_ATTACHED) == 0,
6432 		    ("%dth TX ring is still attached", i));
6433 	}
6434 #endif
6435 }
6436 
6437 static int
6438 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6439 {
6440 	struct vmbus_channel **subchans;
6441 	int nchan, rxr_cnt, error;
6442 
6443 	nchan = *nsubch + 1;
6444 	if (nchan == 1) {
6445 		/*
6446 		 * Multiple RX/TX rings are not requested.
6447 		 */
6448 		*nsubch = 0;
6449 		return (0);
6450 	}
6451 
6452 	/*
6453 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6454 	 * table entries.
6455 	 */
6456 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6457 	if (error) {
6458 		/* No RSS; this is benign. */
6459 		*nsubch = 0;
6460 		return (0);
6461 	}
6462 	if (bootverbose) {
6463 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6464 		    rxr_cnt, nchan);
6465 	}
6466 
6467 	if (nchan > rxr_cnt)
6468 		nchan = rxr_cnt;
6469 	if (nchan == 1) {
6470 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6471 		*nsubch = 0;
6472 		return (0);
6473 	}
6474 
6475 	/*
6476 	 * Allocate sub-channels from NVS.
6477 	 */
6478 	*nsubch = nchan - 1;
6479 	error = hn_nvs_alloc_subchans(sc, nsubch);
6480 	if (error || *nsubch == 0) {
6481 		/* Failed to allocate sub-channels. */
6482 		*nsubch = 0;
6483 		return (0);
6484 	}
6485 
6486 	/*
6487 	 * Wait for all sub-channels to become ready before moving on.
6488 	 */
6489 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6490 	vmbus_subchan_rel(subchans, *nsubch);
6491 	return (0);
6492 }
6493 
6494 static bool
6495 hn_synth_attachable(const struct hn_softc *sc)
6496 {
6497 	int i;
6498 
6499 	if (sc->hn_flags & HN_FLAG_ERRORS)
6500 		return (false);
6501 
6502 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6503 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6504 
6505 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6506 			return (false);
6507 	}
6508 	return (true);
6509 }
6510 
6511 /*
6512  * Make sure that the RX filter is zero after the successful
6513  * RNDIS initialization.
6514  *
6515  * NOTE:
6516  * Under certain conditions on certain versions of Hyper-V,
6517  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6518  * after the successful RNDIS initialization, which breaks
6519  * the assumption of any following code (well, it breaks the
6520  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6521  * explicitly, drain packets sneaking through, and drain the
6522  * interrupt taskqueues scheduled due to the stealth packets.
6523  */
6524 static void
6525 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6526 {
6527 
6528 	hn_disable_rx(sc);
6529 	hn_drain_rxtx(sc, nchan);
6530 }
6531 
6532 static int
6533 hn_synth_attach(struct hn_softc *sc, int mtu)
6534 {
6535 #define ATTACHED_NVS		0x0002
6536 #define ATTACHED_RNDIS		0x0004
6537 
6538 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6539 	int error, nsubch, nchan = 1, i, rndis_inited;
6540 	uint32_t old_caps, attached = 0;
6541 
6542 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6543 	    ("synthetic parts were attached"));
6544 
6545 	if (!hn_synth_attachable(sc))
6546 		return (ENXIO);
6547 
6548 	/* Save capabilities for later verification. */
6549 	old_caps = sc->hn_caps;
6550 	sc->hn_caps = 0;
6551 
6552 	/* Clear RSS stuffs. */
6553 	sc->hn_rss_ind_size = 0;
6554 	sc->hn_rss_hash = 0;
6555 	sc->hn_rss_hcap = 0;
6556 
6557 	/*
6558 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6559 	 */
6560 	error = hn_chan_attach(sc, sc->hn_prichan);
6561 	if (error)
6562 		goto failed;
6563 
6564 	/*
6565 	 * Attach NVS.
6566 	 */
6567 	error = hn_nvs_attach(sc, mtu);
6568 	if (error)
6569 		goto failed;
6570 	attached |= ATTACHED_NVS;
6571 
6572 	/*
6573 	 * Attach RNDIS _after_ NVS is attached.
6574 	 */
6575 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6576 	if (rndis_inited)
6577 		attached |= ATTACHED_RNDIS;
6578 	if (error)
6579 		goto failed;
6580 
6581 	/*
6582 	 * Make sure capabilities are not changed.
6583 	 */
6584 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6585 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6586 		    old_caps, sc->hn_caps);
6587 		error = ENXIO;
6588 		goto failed;
6589 	}
6590 
6591 	/*
6592 	 * Allocate sub-channels for multi-TX/RX rings.
6593 	 *
6594 	 * NOTE:
6595 	 * The # of RX rings that can be used is equivalent to the # of
6596 	 * channels to be requested.
6597 	 */
6598 	nsubch = sc->hn_rx_ring_cnt - 1;
6599 	error = hn_synth_alloc_subchans(sc, &nsubch);
6600 	if (error)
6601 		goto failed;
6602 	/* NOTE: _Full_ synthetic parts detach is required now. */
6603 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6604 
6605 	/*
6606 	 * Set the # of TX/RX rings that could be used according to
6607 	 * the # of channels that NVS offered.
6608 	 */
6609 	nchan = nsubch + 1;
6610 	hn_set_ring_inuse(sc, nchan);
6611 	if (nchan == 1) {
6612 		/* Only the primary channel can be used; done */
6613 		goto back;
6614 	}
6615 
6616 	/*
6617 	 * Attach the sub-channels.
6618 	 *
6619 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6620 	 */
6621 	error = hn_attach_subchans(sc);
6622 	if (error)
6623 		goto failed;
6624 
6625 	/*
6626 	 * Configure RSS key and indirect table _after_ all sub-channels
6627 	 * are attached.
6628 	 */
6629 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6630 		/*
6631 		 * RSS key is not set yet; set it to the default RSS key.
6632 		 */
6633 		if (bootverbose)
6634 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6635 #ifdef RSS
6636 		rss_getkey(rss->rss_key);
6637 #else
6638 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6639 #endif
6640 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6641 	}
6642 
6643 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6644 		/*
6645 		 * RSS indirect table is not set yet; set it up in round-
6646 		 * robin fashion.
6647 		 */
6648 		if (bootverbose) {
6649 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6650 			    "table\n");
6651 		}
6652 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6653 			uint32_t subidx;
6654 
6655 #ifdef RSS
6656 			subidx = rss_get_indirection_to_bucket(i);
6657 #else
6658 			subidx = i;
6659 #endif
6660 			rss->rss_ind[i] = subidx % nchan;
6661 		}
6662 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6663 	} else {
6664 		/*
6665 		 * # of usable channels may be changed, so we have to
6666 		 * make sure that all entries in RSS indirect table
6667 		 * are valid.
6668 		 *
6669 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6670 		 */
6671 		hn_rss_ind_fixup(sc);
6672 	}
6673 
6674 	sc->hn_rss_hash = sc->hn_rss_hcap;
6675 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6676 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6677 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6678 		hn_vf_rss_fixup(sc, false);
6679 	}
6680 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6681 	if (error)
6682 		goto failed;
6683 back:
6684 	/*
6685 	 * Fixup transmission aggregation setup.
6686 	 */
6687 	hn_set_txagg(sc);
6688 	hn_rndis_init_fixat(sc, nchan);
6689 	return (0);
6690 
6691 failed:
6692 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6693 		hn_rndis_init_fixat(sc, nchan);
6694 		hn_synth_detach(sc);
6695 	} else {
6696 		if (attached & ATTACHED_RNDIS) {
6697 			hn_rndis_init_fixat(sc, nchan);
6698 			hn_rndis_detach(sc);
6699 		}
6700 		if (attached & ATTACHED_NVS)
6701 			hn_nvs_detach(sc);
6702 		hn_chan_detach(sc, sc->hn_prichan);
6703 		/* Restore old capabilities. */
6704 		sc->hn_caps = old_caps;
6705 	}
6706 	return (error);
6707 
6708 #undef ATTACHED_RNDIS
6709 #undef ATTACHED_NVS
6710 }
6711 
6712 /*
6713  * NOTE:
6714  * The interface must have been suspended though hn_suspend(), before
6715  * this function get called.
6716  */
6717 static void
6718 hn_synth_detach(struct hn_softc *sc)
6719 {
6720 
6721 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6722 	    ("synthetic parts were not attached"));
6723 
6724 	/* Detach the RNDIS first. */
6725 	hn_rndis_detach(sc);
6726 
6727 	/* Detach NVS. */
6728 	hn_nvs_detach(sc);
6729 
6730 	/* Detach all of the channels. */
6731 	hn_detach_allchans(sc);
6732 
6733 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6734 		/*
6735 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6736 		 */
6737 		int error;
6738 
6739 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6740 		    sc->hn_rxbuf_gpadl);
6741 		if (error) {
6742 			if_printf(sc->hn_ifp,
6743 			    "rxbuf gpadl disconn failed: %d\n", error);
6744 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6745 		}
6746 		sc->hn_rxbuf_gpadl = 0;
6747 	}
6748 
6749 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6750 		/*
6751 		 * Host is post-Win2016, disconnect chimney sending buffer from
6752 		 * primary channel here.
6753 		 */
6754 		int error;
6755 
6756 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6757 		    sc->hn_chim_gpadl);
6758 		if (error) {
6759 			if_printf(sc->hn_ifp,
6760 			    "chim gpadl disconn failed: %d\n", error);
6761 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6762 		}
6763 		sc->hn_chim_gpadl = 0;
6764 	}
6765 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6766 }
6767 
6768 static void
6769 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6770 {
6771 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6772 	    ("invalid ring count %d", ring_cnt));
6773 
6774 	if (sc->hn_tx_ring_cnt > ring_cnt)
6775 		sc->hn_tx_ring_inuse = ring_cnt;
6776 	else
6777 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6778 	sc->hn_rx_ring_inuse = ring_cnt;
6779 
6780 #ifdef RSS
6781 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6782 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6783 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6784 		    rss_getnumbuckets());
6785 	}
6786 #endif
6787 
6788 	if (bootverbose) {
6789 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6790 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6791 	}
6792 }
6793 
6794 static void
6795 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6796 {
6797 
6798 	/*
6799 	 * NOTE:
6800 	 * The TX bufring will not be drained by the hypervisor,
6801 	 * if the primary channel is revoked.
6802 	 */
6803 	while (!vmbus_chan_rx_empty(chan) ||
6804 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6805 	     !vmbus_chan_tx_empty(chan)))
6806 		pause("waitch", 1);
6807 	vmbus_chan_intr_drain(chan);
6808 }
6809 
6810 static void
6811 hn_disable_rx(struct hn_softc *sc)
6812 {
6813 
6814 	/*
6815 	 * Disable RX by clearing RX filter forcefully.
6816 	 */
6817 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6818 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6819 
6820 	/*
6821 	 * Give RNDIS enough time to flush all pending data packets.
6822 	 */
6823 	pause("waitrx", (200 * hz) / 1000);
6824 }
6825 
6826 /*
6827  * NOTE:
6828  * RX/TX _must_ have been suspended/disabled, before this function
6829  * is called.
6830  */
6831 static void
6832 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6833 {
6834 	struct vmbus_channel **subch = NULL;
6835 	int nsubch;
6836 
6837 	/*
6838 	 * Drain RX/TX bufrings and interrupts.
6839 	 */
6840 	nsubch = nchan - 1;
6841 	if (nsubch > 0)
6842 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6843 
6844 	if (subch != NULL) {
6845 		int i;
6846 
6847 		for (i = 0; i < nsubch; ++i)
6848 			hn_chan_drain(sc, subch[i]);
6849 	}
6850 	hn_chan_drain(sc, sc->hn_prichan);
6851 
6852 	if (subch != NULL)
6853 		vmbus_subchan_rel(subch, nsubch);
6854 }
6855 
6856 static void
6857 hn_suspend_data(struct hn_softc *sc)
6858 {
6859 	struct hn_tx_ring *txr;
6860 	int i;
6861 
6862 	HN_LOCK_ASSERT(sc);
6863 
6864 	/*
6865 	 * Suspend TX.
6866 	 */
6867 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6868 		txr = &sc->hn_tx_ring[i];
6869 
6870 		mtx_lock(&txr->hn_tx_lock);
6871 		txr->hn_suspended = 1;
6872 		mtx_unlock(&txr->hn_tx_lock);
6873 		/* No one is able send more packets now. */
6874 
6875 		/*
6876 		 * Wait for all pending sends to finish.
6877 		 *
6878 		 * NOTE:
6879 		 * We will _not_ receive all pending send-done, if the
6880 		 * primary channel is revoked.
6881 		 */
6882 		while (hn_tx_ring_pending(txr) &&
6883 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6884 			pause("hnwtx", 1 /* 1 tick */);
6885 	}
6886 
6887 	/*
6888 	 * Disable RX.
6889 	 */
6890 	hn_disable_rx(sc);
6891 
6892 	/*
6893 	 * Drain RX/TX.
6894 	 */
6895 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6896 
6897 	/*
6898 	 * Drain any pending TX tasks.
6899 	 *
6900 	 * NOTE:
6901 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6902 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6903 	 */
6904 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6905 		txr = &sc->hn_tx_ring[i];
6906 
6907 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6908 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6909 	}
6910 }
6911 
6912 static void
6913 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6914 {
6915 
6916 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6917 }
6918 
6919 static void
6920 hn_suspend_mgmt(struct hn_softc *sc)
6921 {
6922 	struct task task;
6923 
6924 	HN_LOCK_ASSERT(sc);
6925 
6926 	/*
6927 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6928 	 * through hn_mgmt_taskq.
6929 	 */
6930 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6931 	vmbus_chan_run_task(sc->hn_prichan, &task);
6932 
6933 	/*
6934 	 * Make sure that all pending management tasks are completed.
6935 	 */
6936 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6937 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6938 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6939 }
6940 
6941 static void
6942 hn_suspend(struct hn_softc *sc)
6943 {
6944 
6945 	/* Disable polling. */
6946 	hn_polling(sc, 0);
6947 
6948 	/*
6949 	 * If the non-transparent mode VF is activated, the synthetic
6950 	 * device is receiving packets, so the data path of the
6951 	 * synthetic device must be suspended.
6952 	 */
6953 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6954 	    (sc->hn_flags & HN_FLAG_RXVF))
6955 		hn_suspend_data(sc);
6956 	hn_suspend_mgmt(sc);
6957 }
6958 
6959 static void
6960 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6961 {
6962 	int i;
6963 
6964 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6965 	    ("invalid TX ring count %d", tx_ring_cnt));
6966 
6967 	for (i = 0; i < tx_ring_cnt; ++i) {
6968 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6969 
6970 		mtx_lock(&txr->hn_tx_lock);
6971 		txr->hn_suspended = 0;
6972 		mtx_unlock(&txr->hn_tx_lock);
6973 	}
6974 }
6975 
6976 static void
6977 hn_resume_data(struct hn_softc *sc)
6978 {
6979 	int i;
6980 
6981 	HN_LOCK_ASSERT(sc);
6982 
6983 	/*
6984 	 * Re-enable RX.
6985 	 */
6986 	hn_rxfilter_config(sc);
6987 
6988 	/*
6989 	 * Make sure to clear suspend status on "all" TX rings,
6990 	 * since hn_tx_ring_inuse can be changed after
6991 	 * hn_suspend_data().
6992 	 */
6993 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6994 
6995 #ifdef HN_IFSTART_SUPPORT
6996 	if (!hn_use_if_start)
6997 #endif
6998 	{
6999 		/*
7000 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
7001 		 * reduced.
7002 		 */
7003 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
7004 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
7005 	}
7006 
7007 	/*
7008 	 * Kick start TX.
7009 	 */
7010 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
7011 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
7012 
7013 		/*
7014 		 * Use txeof task, so that any pending oactive can be
7015 		 * cleared properly.
7016 		 */
7017 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
7018 	}
7019 }
7020 
7021 static void
7022 hn_resume_mgmt(struct hn_softc *sc)
7023 {
7024 
7025 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
7026 
7027 	/*
7028 	 * Kick off network change detection, if it was pending.
7029 	 * If no network change was pending, start link status
7030 	 * checks, which is more lightweight than network change
7031 	 * detection.
7032 	 */
7033 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
7034 		hn_change_network(sc);
7035 	else
7036 		hn_update_link_status(sc);
7037 }
7038 
7039 static void
7040 hn_resume(struct hn_softc *sc)
7041 {
7042 
7043 	/*
7044 	 * If the non-transparent mode VF is activated, the synthetic
7045 	 * device have to receive packets, so the data path of the
7046 	 * synthetic device must be resumed.
7047 	 */
7048 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7049 	    (sc->hn_flags & HN_FLAG_RXVF))
7050 		hn_resume_data(sc);
7051 
7052 	/*
7053 	 * Don't resume link status change if VF is attached/activated.
7054 	 * - In the non-transparent VF mode, the synthetic device marks
7055 	 *   link down until the VF is deactivated; i.e. VF is down.
7056 	 * - In transparent VF mode, VF's media status is used until
7057 	 *   the VF is detached.
7058 	 */
7059 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7060 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7061 		hn_resume_mgmt(sc);
7062 
7063 	/*
7064 	 * Re-enable polling if this interface is running and
7065 	 * the polling is requested.
7066 	 */
7067 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7068 		hn_polling(sc, sc->hn_pollhz);
7069 }
7070 
7071 static void
7072 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7073 {
7074 	const struct rndis_status_msg *msg;
7075 	int ofs;
7076 
7077 	if (dlen < sizeof(*msg)) {
7078 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7079 		return;
7080 	}
7081 	msg = data;
7082 
7083 	switch (msg->rm_status) {
7084 	case RNDIS_STATUS_MEDIA_CONNECT:
7085 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7086 		hn_update_link_status(sc);
7087 		break;
7088 
7089 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7090 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7091 		/* Not really useful; ignore. */
7092 		break;
7093 
7094 	case RNDIS_STATUS_NETWORK_CHANGE:
7095 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7096 		if (dlen < ofs + msg->rm_stbuflen ||
7097 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7098 			if_printf(sc->hn_ifp, "network changed\n");
7099 		} else {
7100 			uint32_t change;
7101 
7102 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7103 			    sizeof(change));
7104 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7105 			    change);
7106 		}
7107 		hn_change_network(sc);
7108 		break;
7109 
7110 	default:
7111 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7112 		    msg->rm_status);
7113 		break;
7114 	}
7115 }
7116 
7117 static int
7118 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7119 {
7120 	const struct rndis_pktinfo *pi = info_data;
7121 	uint32_t mask = 0;
7122 
7123 	while (info_dlen != 0) {
7124 		const void *data;
7125 		uint32_t dlen;
7126 
7127 		if (__predict_false(info_dlen < sizeof(*pi)))
7128 			return (EINVAL);
7129 		if (__predict_false(info_dlen < pi->rm_size))
7130 			return (EINVAL);
7131 		info_dlen -= pi->rm_size;
7132 
7133 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7134 			return (EINVAL);
7135 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7136 			return (EINVAL);
7137 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7138 		data = pi->rm_data;
7139 
7140 		if (pi->rm_internal == 1) {
7141 			switch (pi->rm_type) {
7142 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7143 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7144 					return (EINVAL);
7145 				info->pktinfo_id =
7146 				    (const struct packet_info_id *)data;
7147 				mask |= HN_RXINFO_PKTINFO_ID;
7148 				break;
7149 
7150 			default:
7151 				goto next;
7152 			}
7153 		} else {
7154 			switch (pi->rm_type) {
7155 			case NDIS_PKTINFO_TYPE_VLAN:
7156 				if (__predict_false(dlen
7157 				    < NDIS_VLAN_INFO_SIZE))
7158 					return (EINVAL);
7159 				info->vlan_info = (const uint32_t *)data;
7160 				mask |= HN_RXINFO_VLAN;
7161 				break;
7162 
7163 			case NDIS_PKTINFO_TYPE_CSUM:
7164 				if (__predict_false(dlen
7165 				    < NDIS_RXCSUM_INFO_SIZE))
7166 					return (EINVAL);
7167 				info->csum_info = (const uint32_t *)data;
7168 				mask |= HN_RXINFO_CSUM;
7169 				break;
7170 
7171 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7172 				if (__predict_false(dlen
7173 				    < HN_NDIS_HASH_VALUE_SIZE))
7174 					return (EINVAL);
7175 				info->hash_value = (const uint32_t *)data;
7176 				mask |= HN_RXINFO_HASHVAL;
7177 				break;
7178 
7179 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7180 				if (__predict_false(dlen
7181 				    < HN_NDIS_HASH_INFO_SIZE))
7182 					return (EINVAL);
7183 				info->hash_info = (const uint32_t *)data;
7184 				mask |= HN_RXINFO_HASHINF;
7185 				break;
7186 
7187 			default:
7188 				goto next;
7189 			}
7190 		}
7191 
7192 		if (mask == HN_RXINFO_ALL) {
7193 			/* All found; done */
7194 			break;
7195 		}
7196 next:
7197 		pi = (const struct rndis_pktinfo *)
7198 		    ((const uint8_t *)pi + pi->rm_size);
7199 	}
7200 
7201 	/*
7202 	 * Final fixup.
7203 	 * - If there is no hash value, invalidate the hash info.
7204 	 */
7205 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7206 		info->hash_info = NULL;
7207 	return (0);
7208 }
7209 
7210 static __inline bool
7211 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7212 {
7213 
7214 	if (off < check_off) {
7215 		if (__predict_true(off + len <= check_off))
7216 			return (false);
7217 	} else if (off > check_off) {
7218 		if (__predict_true(check_off + check_len <= off))
7219 			return (false);
7220 	}
7221 	return (true);
7222 }
7223 
7224 static __inline void
7225 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7226 		uint32_t len, struct hn_rxinfo *info)
7227 {
7228 	uint32_t cnt = rxr->rsc.cnt;
7229 
7230 	if (cnt) {
7231 		rxr->rsc.pktlen += len;
7232 	} else {
7233 		rxr->rsc.vlan_info = info->vlan_info;
7234 		rxr->rsc.csum_info = info->csum_info;
7235 		rxr->rsc.hash_info = info->hash_info;
7236 		rxr->rsc.hash_value = info->hash_value;
7237 		rxr->rsc.pktlen = len;
7238 	}
7239 
7240 	rxr->rsc.frag_data[cnt] = data;
7241 	rxr->rsc.frag_len[cnt] = len;
7242 	rxr->rsc.cnt++;
7243 }
7244 
7245 static void
7246 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7247 {
7248 	const struct rndis_packet_msg *pkt;
7249 	struct hn_rxinfo info;
7250 	int data_off, pktinfo_off, data_len, pktinfo_len;
7251 	bool rsc_more= false;
7252 
7253 	/*
7254 	 * Check length.
7255 	 */
7256 	if (__predict_false(dlen < sizeof(*pkt))) {
7257 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7258 		return;
7259 	}
7260 	pkt = data;
7261 
7262 	if (__predict_false(dlen < pkt->rm_len)) {
7263 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7264 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7265 		return;
7266 	}
7267 	if (__predict_false(pkt->rm_len <
7268 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7269 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7270 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7271 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7272 		    pkt->rm_pktinfolen);
7273 		return;
7274 	}
7275 	if (__predict_false(pkt->rm_datalen == 0)) {
7276 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7277 		return;
7278 	}
7279 
7280 	/*
7281 	 * Check offests.
7282 	 */
7283 #define IS_OFFSET_INVALID(ofs)			\
7284 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7285 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7286 
7287 	/* XXX Hyper-V does not meet data offset alignment requirement */
7288 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7289 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7290 		    "data offset %u\n", pkt->rm_dataoffset);
7291 		return;
7292 	}
7293 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7294 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7295 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7296 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7297 		return;
7298 	}
7299 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7300 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7301 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7302 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7303 		return;
7304 	}
7305 
7306 #undef IS_OFFSET_INVALID
7307 
7308 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7309 	data_len = pkt->rm_datalen;
7310 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7311 	pktinfo_len = pkt->rm_pktinfolen;
7312 
7313 	/*
7314 	 * Check OOB coverage.
7315 	 */
7316 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7317 		int oob_off, oob_len;
7318 
7319 		if_printf(rxr->hn_ifp, "got oobdata\n");
7320 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7321 		oob_len = pkt->rm_oobdatalen;
7322 
7323 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7324 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7325 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7326 			    pkt->rm_len, oob_off, oob_len);
7327 			return;
7328 		}
7329 
7330 		/*
7331 		 * Check against data.
7332 		 */
7333 		if (hn_rndis_check_overlap(oob_off, oob_len,
7334 		    data_off, data_len)) {
7335 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7336 			    "oob overlaps data, oob abs %d len %d, "
7337 			    "data abs %d len %d\n",
7338 			    oob_off, oob_len, data_off, data_len);
7339 			return;
7340 		}
7341 
7342 		/*
7343 		 * Check against pktinfo.
7344 		 */
7345 		if (pktinfo_len != 0 &&
7346 		    hn_rndis_check_overlap(oob_off, oob_len,
7347 		    pktinfo_off, pktinfo_len)) {
7348 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7349 			    "oob overlaps pktinfo, oob abs %d len %d, "
7350 			    "pktinfo abs %d len %d\n",
7351 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7352 			return;
7353 		}
7354 	}
7355 
7356 	/*
7357 	 * Check per-packet-info coverage and find useful per-packet-info.
7358 	 */
7359 	info.vlan_info = NULL;
7360 	info.csum_info = NULL;
7361 	info.hash_info = NULL;
7362 	info.pktinfo_id = NULL;
7363 
7364 	if (__predict_true(pktinfo_len != 0)) {
7365 		bool overlap;
7366 		int error;
7367 
7368 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7369 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7370 			    "pktinfo overflow, msglen %u, "
7371 			    "pktinfo abs %d len %d\n",
7372 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7373 			return;
7374 		}
7375 
7376 		/*
7377 		 * Check packet info coverage.
7378 		 */
7379 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7380 		    data_off, data_len);
7381 		if (__predict_false(overlap)) {
7382 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7383 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7384 			    "data abs %d len %d\n",
7385 			    pktinfo_off, pktinfo_len, data_off, data_len);
7386 			return;
7387 		}
7388 
7389 		/*
7390 		 * Find useful per-packet-info.
7391 		 */
7392 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7393 		    pktinfo_len, &info);
7394 		if (__predict_false(error)) {
7395 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7396 			    "pktinfo\n");
7397 			return;
7398 		}
7399 	}
7400 
7401 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7402 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7403 		    "data overflow, msglen %u, data abs %d len %d\n",
7404 		    pkt->rm_len, data_off, data_len);
7405 		return;
7406 	}
7407 
7408 	/* Identify RSC fragments, drop invalid packets */
7409 	if ((info.pktinfo_id != NULL) &&
7410 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7411 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7412 			rxr->rsc.cnt = 0;
7413 			rxr->hn_rsc_pkts++;
7414 		} else if (rxr->rsc.cnt == 0)
7415 			goto drop;
7416 
7417 		rsc_more = true;
7418 
7419 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7420 			rsc_more = false;
7421 
7422 		if (rsc_more && rxr->rsc.is_last)
7423 			goto drop;
7424 	} else {
7425 		rxr->rsc.cnt = 0;
7426 	}
7427 
7428 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7429 		goto drop;
7430 
7431 	/* Store data in per rx ring structure */
7432 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7433 	    data_len, &info);
7434 
7435 	if (rsc_more)
7436 		return;
7437 
7438 	hn_rxpkt(rxr);
7439 	rxr->rsc.cnt = 0;
7440 	return;
7441 drop:
7442 	rxr->hn_rsc_drop++;
7443 	return;
7444 }
7445 
7446 static __inline void
7447 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7448 {
7449 	const struct rndis_msghdr *hdr;
7450 
7451 	if (__predict_false(dlen < sizeof(*hdr))) {
7452 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7453 		return;
7454 	}
7455 	hdr = data;
7456 
7457 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7458 		/* Hot data path. */
7459 		hn_rndis_rx_data(rxr, data, dlen);
7460 		/* Done! */
7461 		return;
7462 	}
7463 
7464 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7465 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7466 	else
7467 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7468 }
7469 
7470 static void
7471 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7472 {
7473 	const struct hn_nvs_hdr *hdr;
7474 
7475 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7476 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7477 		return;
7478 	}
7479 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7480 
7481 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7482 		/* Useless; ignore */
7483 		return;
7484 	}
7485 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7486 }
7487 
7488 static void
7489 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7490     const struct vmbus_chanpkt_hdr *pkt)
7491 {
7492 	struct hn_nvs_sendctx *sndc;
7493 
7494 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7495 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7496 	    VMBUS_CHANPKT_DATALEN(pkt));
7497 	/*
7498 	 * NOTE:
7499 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7500 	 * its callback.
7501 	 */
7502 }
7503 
7504 static void
7505 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7506     const struct vmbus_chanpkt_hdr *pkthdr)
7507 {
7508 	struct epoch_tracker et;
7509 	const struct vmbus_chanpkt_rxbuf *pkt;
7510 	const struct hn_nvs_hdr *nvs_hdr;
7511 	int count, i, hlen;
7512 
7513 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7514 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7515 		return;
7516 	}
7517 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7518 
7519 	/* Make sure that this is a RNDIS message. */
7520 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7521 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7522 		    nvs_hdr->nvs_type);
7523 		return;
7524 	}
7525 
7526 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7527 	if (__predict_false(hlen < sizeof(*pkt))) {
7528 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7529 		return;
7530 	}
7531 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7532 
7533 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7534 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7535 		    pkt->cp_rxbuf_id);
7536 		return;
7537 	}
7538 
7539 	count = pkt->cp_rxbuf_cnt;
7540 	if (__predict_false(hlen <
7541 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7542 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7543 		return;
7544 	}
7545 
7546 	NET_EPOCH_ENTER(et);
7547 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7548 	for (i = 0; i < count; ++i) {
7549 		int ofs, len;
7550 
7551 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7552 		len = pkt->cp_rxbuf[i].rb_len;
7553 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7554 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7555 			    "ofs %d, len %d\n", i, ofs, len);
7556 			continue;
7557 		}
7558 
7559 		rxr->rsc.is_last = (i == (count - 1));
7560 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7561 	}
7562 	NET_EPOCH_EXIT(et);
7563 
7564 	/*
7565 	 * Ack the consumed RXBUF associated w/ this channel packet,
7566 	 * so that this RXBUF can be recycled by the hypervisor.
7567 	 */
7568 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7569 }
7570 
7571 static void
7572 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7573     uint64_t tid)
7574 {
7575 	struct hn_nvs_rndis_ack ack;
7576 	int retries, error;
7577 
7578 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7579 	ack.nvs_status = HN_NVS_STATUS_OK;
7580 
7581 	retries = 0;
7582 again:
7583 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7584 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7585 	if (__predict_false(error == EAGAIN)) {
7586 		/*
7587 		 * NOTE:
7588 		 * This should _not_ happen in real world, since the
7589 		 * consumption of the TX bufring from the TX path is
7590 		 * controlled.
7591 		 */
7592 		if (rxr->hn_ack_failed == 0)
7593 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7594 		rxr->hn_ack_failed++;
7595 		retries++;
7596 		if (retries < 10) {
7597 			DELAY(100);
7598 			goto again;
7599 		}
7600 		/* RXBUF leaks! */
7601 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7602 	}
7603 }
7604 
7605 static void
7606 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7607 {
7608 	struct hn_rx_ring *rxr = xrxr;
7609 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7610 
7611 	for (;;) {
7612 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7613 		int error, pktlen;
7614 
7615 		pktlen = rxr->hn_pktbuf_len;
7616 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7617 		if (__predict_false(error == ENOBUFS)) {
7618 			void *nbuf;
7619 			int nlen;
7620 
7621 			/*
7622 			 * Expand channel packet buffer.
7623 			 *
7624 			 * XXX
7625 			 * Use M_WAITOK here, since allocation failure
7626 			 * is fatal.
7627 			 */
7628 			nlen = rxr->hn_pktbuf_len * 2;
7629 			while (nlen < pktlen)
7630 				nlen *= 2;
7631 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7632 
7633 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7634 			    rxr->hn_pktbuf_len, nlen);
7635 
7636 			free(rxr->hn_pktbuf, M_DEVBUF);
7637 			rxr->hn_pktbuf = nbuf;
7638 			rxr->hn_pktbuf_len = nlen;
7639 			/* Retry! */
7640 			continue;
7641 		} else if (__predict_false(error == EAGAIN)) {
7642 			/* No more channel packets; done! */
7643 			break;
7644 		}
7645 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7646 
7647 		switch (pkt->cph_type) {
7648 		case VMBUS_CHANPKT_TYPE_COMP:
7649 			hn_nvs_handle_comp(sc, chan, pkt);
7650 			break;
7651 
7652 		case VMBUS_CHANPKT_TYPE_RXBUF:
7653 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7654 			break;
7655 
7656 		case VMBUS_CHANPKT_TYPE_INBAND:
7657 			hn_nvs_handle_notify(sc, pkt);
7658 			break;
7659 
7660 		default:
7661 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7662 			    pkt->cph_type);
7663 			break;
7664 		}
7665 	}
7666 	hn_chan_rollup(rxr, rxr->hn_txr);
7667 }
7668 
7669 static void
7670 hn_sysinit(void *arg __unused)
7671 {
7672 	int i;
7673 
7674 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7675 
7676 #ifdef HN_IFSTART_SUPPORT
7677 	/*
7678 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7679 	 * mainly due to the IFF_DRV_OACTIVE flag.
7680 	 */
7681 	if (hn_xpnt_vf && hn_use_if_start) {
7682 		hn_use_if_start = 0;
7683 		printf("hn: tranparent VF mode, if_transmit will be used, "
7684 		    "instead of if_start\n");
7685 	}
7686 #endif
7687 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7688 		printf("hn: invalid transparent VF attach routing "
7689 		    "wait timeout %d, reset to %d\n",
7690 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7691 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7692 	}
7693 
7694 	/*
7695 	 * Initialize VF map.
7696 	 */
7697 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7698 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7699 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7700 	    M_WAITOK | M_ZERO);
7701 
7702 	/*
7703 	 * Fix the # of TX taskqueues.
7704 	 */
7705 	if (hn_tx_taskq_cnt <= 0)
7706 		hn_tx_taskq_cnt = 1;
7707 	else if (hn_tx_taskq_cnt > mp_ncpus)
7708 		hn_tx_taskq_cnt = mp_ncpus;
7709 
7710 	/*
7711 	 * Fix the TX taskqueue mode.
7712 	 */
7713 	switch (hn_tx_taskq_mode) {
7714 	case HN_TX_TASKQ_M_INDEP:
7715 	case HN_TX_TASKQ_M_GLOBAL:
7716 	case HN_TX_TASKQ_M_EVTTQ:
7717 		break;
7718 	default:
7719 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7720 		break;
7721 	}
7722 
7723 	if (vm_guest != VM_GUEST_HV)
7724 		return;
7725 
7726 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7727 		return;
7728 
7729 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7730 	    M_DEVBUF, M_WAITOK);
7731 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7732 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7733 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7734 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7735 		    "hn tx%d", i);
7736 	}
7737 }
7738 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7739 
7740 static void
7741 hn_sysuninit(void *arg __unused)
7742 {
7743 
7744 	if (hn_tx_taskque != NULL) {
7745 		int i;
7746 
7747 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7748 			taskqueue_free(hn_tx_taskque[i]);
7749 		free(hn_tx_taskque, M_DEVBUF);
7750 	}
7751 
7752 	if (hn_vfmap != NULL)
7753 		free(hn_vfmap, M_DEVBUF);
7754 	rm_destroy(&hn_vfmap_lock);
7755 
7756 	counter_u64_free(hn_udpcs_fixup);
7757 }
7758 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7759