xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision d93a896e)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/rmlock.h>
73 #include <sys/sbuf.h>
74 #include <sys/smp.h>
75 #include <sys/socket.h>
76 #include <sys/sockio.h>
77 #include <sys/sx.h>
78 #include <sys/sysctl.h>
79 #include <sys/systm.h>
80 #include <sys/taskqueue.h>
81 #include <sys/buf_ring.h>
82 #include <sys/eventhandler.h>
83 
84 #include <machine/atomic.h>
85 #include <machine/in_cksum.h>
86 
87 #include <net/bpf.h>
88 #include <net/ethernet.h>
89 #include <net/if.h>
90 #include <net/if_dl.h>
91 #include <net/if_media.h>
92 #include <net/if_types.h>
93 #include <net/if_var.h>
94 #include <net/rndis.h>
95 #ifdef RSS
96 #include <net/rss_config.h>
97 #endif
98 
99 #include <netinet/in_systm.h>
100 #include <netinet/in.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_lro.h>
105 #include <netinet/udp.h>
106 
107 #include <dev/hyperv/include/hyperv.h>
108 #include <dev/hyperv/include/hyperv_busdma.h>
109 #include <dev/hyperv/include/vmbus.h>
110 #include <dev/hyperv/include/vmbus_xact.h>
111 
112 #include <dev/hyperv/netvsc/ndis.h>
113 #include <dev/hyperv/netvsc/if_hnreg.h>
114 #include <dev/hyperv/netvsc/if_hnvar.h>
115 #include <dev/hyperv/netvsc/hn_nvs.h>
116 #include <dev/hyperv/netvsc/hn_rndis.h>
117 
118 #include "vmbus_if.h"
119 
120 #define HN_IFSTART_SUPPORT
121 
122 #define HN_RING_CNT_DEF_MAX		8
123 
124 #define HN_VFMAP_SIZE_DEF		8
125 
126 /* YYY should get it from the underlying channel */
127 #define HN_TX_DESC_CNT			512
128 
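/*
 * Worst-case size of an RNDIS packet message built by this driver:
 * the RNDIS header plus per-packet-info entries for the hash value,
 * the VLAN tag, LSO (TSO) and TX checksum offload.
 */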
129 #define HN_RNDIS_PKT_LEN					\
130 	(sizeof(struct rndis_packet_msg) +			\
131 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
132 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
133 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
134 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
135 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
136 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
137 
138 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
139 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
140 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
141 /* -1 for RNDIS packet message */
142 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
143 
144 #define HN_DIRECT_TX_SIZE_DEF		128
145 
146 #define HN_EARLY_TXEOF_THRESH		8
147 
148 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
149 
150 #define HN_LROENT_CNT_DEF		128
151 
152 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
153 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
154 /* YYY 2*MTU is a bit rough, but should be good enough. */
155 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
156 
157 #define HN_LRO_ACKCNT_DEF		1
158 
159 #define HN_LOCK_INIT(sc)		\
160 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
161 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
162 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
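/*
 * NOTE: HN_LOCK() polls sx_try_xlock() with a 1ms delay instead of
 * sleeping in sx_xlock(); see the loop below.
 */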
163 #define HN_LOCK(sc)					\
164 do {							\
165 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
166 		DELAY(1000);				\
167 } while (0)
168 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
169 
170 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
171 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
172 #define HN_CSUM_IP_HWASSIST(sc)		\
173 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
174 #define HN_CSUM_IP6_HWASSIST(sc)	\
175 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
176 
177 #define HN_PKTSIZE_MIN(align)		\
178 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
179 	    HN_RNDIS_PKT_LEN, (align))
180 #define HN_PKTSIZE(m, align)		\
181 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
182 
183 #ifdef RSS
184 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
185 #else
186 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
187 #endif
188 
189 struct hn_txdesc {
190 #ifndef HN_USE_TXDESC_BUFRING
191 	SLIST_ENTRY(hn_txdesc)		link;
192 #endif
193 	STAILQ_ENTRY(hn_txdesc)		agg_link;
194 
195 	/* Aggregated txdescs, in sending order. */
196 	STAILQ_HEAD(, hn_txdesc)	agg_list;
197 
198 	/* The oldest packet, if transmission aggregation happens. */
199 	struct mbuf			*m;
200 	struct hn_tx_ring		*txr;
201 	int				refs;
202 	uint32_t			flags;	/* HN_TXD_FLAG_ */
203 	struct hn_nvs_sendctx		send_ctx;
204 	uint32_t			chim_index;
205 	int				chim_size;
206 
207 	bus_dmamap_t			data_dmap;
208 
209 	bus_addr_t			rndis_pkt_paddr;
210 	struct rndis_packet_msg		*rndis_pkt;
211 	bus_dmamap_t			rndis_pkt_dmap;
212 };
213 
214 #define HN_TXD_FLAG_ONLIST		0x0001
215 #define HN_TXD_FLAG_DMAMAP		0x0002
216 #define HN_TXD_FLAG_ONAGG		0x0004
217 
218 struct hn_rxinfo {
219 	uint32_t			vlan_info;
220 	uint32_t			csum_info;
221 	uint32_t			hash_info;
222 	uint32_t			hash_value;
223 };
224 
225 struct hn_rxvf_setarg {
226 	struct hn_rx_ring	*rxr;
227 	struct ifnet		*vf_ifp;
228 };
229 
230 #define HN_RXINFO_VLAN			0x0001
231 #define HN_RXINFO_CSUM			0x0002
232 #define HN_RXINFO_HASHINF		0x0004
233 #define HN_RXINFO_HASHVAL		0x0008
234 #define HN_RXINFO_ALL			\
235 	(HN_RXINFO_VLAN |		\
236 	 HN_RXINFO_CSUM |		\
237 	 HN_RXINFO_HASHINF |		\
238 	 HN_RXINFO_HASHVAL)
239 
240 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
241 #define HN_NDIS_RXCSUM_INFO_INVALID	0
242 #define HN_NDIS_HASH_INFO_INVALID	0
243 
244 static int			hn_probe(device_t);
245 static int			hn_attach(device_t);
246 static int			hn_detach(device_t);
247 static int			hn_shutdown(device_t);
248 static void			hn_chan_callback(struct vmbus_channel *,
249 				    void *);
250 
251 static void			hn_init(void *);
252 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
253 #ifdef HN_IFSTART_SUPPORT
254 static void			hn_start(struct ifnet *);
255 #endif
256 static int			hn_transmit(struct ifnet *, struct mbuf *);
257 static void			hn_xmit_qflush(struct ifnet *);
258 static int			hn_ifmedia_upd(struct ifnet *);
259 static void			hn_ifmedia_sts(struct ifnet *,
260 				    struct ifmediareq *);
261 
262 static void			hn_ifnet_event(void *, struct ifnet *, int);
263 static void			hn_ifaddr_event(void *, struct ifnet *);
264 static void			hn_ifnet_attevent(void *, struct ifnet *);
265 static void			hn_ifnet_detevent(void *, struct ifnet *);
266 
267 static bool			hn_ismyvf(const struct hn_softc *,
268 				    const struct ifnet *);
269 static void			hn_rxvf_change(struct hn_softc *,
270 				    struct ifnet *, bool);
271 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
272 static void			hn_rxvf_set_task(void *, int);
273 
274 static int			hn_rndis_rxinfo(const void *, int,
275 				    struct hn_rxinfo *);
276 static void			hn_rndis_rx_data(struct hn_rx_ring *,
277 				    const void *, int);
278 static void			hn_rndis_rx_status(struct hn_softc *,
279 				    const void *, int);
280 static void			hn_rndis_init_fixat(struct hn_softc *, int);
281 
282 static void			hn_nvs_handle_notify(struct hn_softc *,
283 				    const struct vmbus_chanpkt_hdr *);
284 static void			hn_nvs_handle_comp(struct hn_softc *,
285 				    struct vmbus_channel *,
286 				    const struct vmbus_chanpkt_hdr *);
287 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
288 				    struct vmbus_channel *,
289 				    const struct vmbus_chanpkt_hdr *);
290 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
291 				    struct vmbus_channel *, uint64_t);
292 
293 #if __FreeBSD_version >= 1100099
294 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
295 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
296 #endif
297 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
298 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
299 #if __FreeBSD_version < 1100095
300 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
301 #else
302 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
303 #endif
304 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
305 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
306 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
307 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
308 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
309 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
310 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
311 #ifndef RSS
312 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
313 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
314 #endif
315 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
316 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
317 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
318 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
319 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
320 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
321 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
325 
326 static void			hn_stop(struct hn_softc *, bool);
327 static void			hn_init_locked(struct hn_softc *);
328 static int			hn_chan_attach(struct hn_softc *,
329 				    struct vmbus_channel *);
330 static void			hn_chan_detach(struct hn_softc *,
331 				    struct vmbus_channel *);
332 static int			hn_attach_subchans(struct hn_softc *);
333 static void			hn_detach_allchans(struct hn_softc *);
334 static void			hn_chan_rollup(struct hn_rx_ring *,
335 				    struct hn_tx_ring *);
336 static void			hn_set_ring_inuse(struct hn_softc *, int);
337 static int			hn_synth_attach(struct hn_softc *, int);
338 static void			hn_synth_detach(struct hn_softc *);
339 static int			hn_synth_alloc_subchans(struct hn_softc *,
340 				    int *);
341 static bool			hn_synth_attachable(const struct hn_softc *);
342 static void			hn_suspend(struct hn_softc *);
343 static void			hn_suspend_data(struct hn_softc *);
344 static void			hn_suspend_mgmt(struct hn_softc *);
345 static void			hn_resume(struct hn_softc *);
346 static void			hn_resume_data(struct hn_softc *);
347 static void			hn_resume_mgmt(struct hn_softc *);
348 static void			hn_suspend_mgmt_taskfunc(void *, int);
349 static void			hn_chan_drain(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_disable_rx(struct hn_softc *);
352 static void			hn_drain_rxtx(struct hn_softc *, int);
353 static void			hn_polling(struct hn_softc *, u_int);
354 static void			hn_chan_polling(struct vmbus_channel *, u_int);
355 
356 static void			hn_update_link_status(struct hn_softc *);
357 static void			hn_change_network(struct hn_softc *);
358 static void			hn_link_taskfunc(void *, int);
359 static void			hn_netchg_init_taskfunc(void *, int);
360 static void			hn_netchg_status_taskfunc(void *, int);
361 static void			hn_link_status(struct hn_softc *);
362 
363 static int			hn_create_rx_data(struct hn_softc *, int);
364 static void			hn_destroy_rx_data(struct hn_softc *);
365 static int			hn_check_iplen(const struct mbuf *, int);
366 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
367 static int			hn_rxfilter_config(struct hn_softc *);
368 #ifndef RSS
369 static int			hn_rss_reconfig(struct hn_softc *);
370 #endif
371 static void			hn_rss_ind_fixup(struct hn_softc *);
372 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
373 				    int, const struct hn_rxinfo *);
374 
375 static int			hn_tx_ring_create(struct hn_softc *, int);
376 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
377 static int			hn_create_tx_data(struct hn_softc *, int);
378 static void			hn_fixup_tx_data(struct hn_softc *);
379 static void			hn_destroy_tx_data(struct hn_softc *);
380 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
381 static void			hn_txdesc_gc(struct hn_tx_ring *,
382 				    struct hn_txdesc *);
383 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
384 				    struct hn_txdesc *, struct mbuf **);
385 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
386 				    struct hn_txdesc *);
387 static void			hn_set_chim_size(struct hn_softc *, int);
388 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
389 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
390 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
391 static void			hn_resume_tx(struct hn_softc *, int);
392 static void			hn_set_txagg(struct hn_softc *);
393 static void			*hn_try_txagg(struct ifnet *,
394 				    struct hn_tx_ring *, struct hn_txdesc *,
395 				    int);
396 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
397 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
398 				    struct hn_softc *, struct vmbus_channel *,
399 				    const void *, int);
400 static int			hn_txpkt_sglist(struct hn_tx_ring *,
401 				    struct hn_txdesc *);
402 static int			hn_txpkt_chim(struct hn_tx_ring *,
403 				    struct hn_txdesc *);
404 static int			hn_xmit(struct hn_tx_ring *, int);
405 static void			hn_xmit_taskfunc(void *, int);
406 static void			hn_xmit_txeof(struct hn_tx_ring *);
407 static void			hn_xmit_txeof_taskfunc(void *, int);
408 #ifdef HN_IFSTART_SUPPORT
409 static int			hn_start_locked(struct hn_tx_ring *, int);
410 static void			hn_start_taskfunc(void *, int);
411 static void			hn_start_txeof(struct hn_tx_ring *);
412 static void			hn_start_txeof_taskfunc(void *, int);
413 #endif
414 
415 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
416     "Hyper-V network interface");
417 
418 /* Trust TCP segment verification on the host side. */
419 static int			hn_trust_hosttcp = 1;
420 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
421     &hn_trust_hosttcp, 0,
422     "Trust TCP segment verification on the host side, "
423     "when csum info is missing (global setting)");
424 
425 /* Trust UDP datagram verification on the host side. */
426 static int			hn_trust_hostudp = 1;
427 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
428     &hn_trust_hostudp, 0,
429     "Trust UDP datagram verification on the host side, "
430     "when csum info is missing (global setting)");
431 
432 /* Trust IP packet verification on the host side. */
433 static int			hn_trust_hostip = 1;
434 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
435     &hn_trust_hostip, 0,
436     "Trust IP packet verification on the host side, "
437     "when csum info is missing (global setting)");
438 
439 /* Limit TSO burst size */
440 static int			hn_tso_maxlen = IP_MAXPACKET;
441 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
442     &hn_tso_maxlen, 0, "TSO burst limit");
443 
444 /* Limit chimney send size */
445 static int			hn_tx_chimney_size = 0;
446 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
447     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
448 
449 /* Limit the size of packets for direct transmission */
450 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
451 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
452     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
453 
454 /* # of LRO entries per RX ring */
455 #if defined(INET) || defined(INET6)
456 #if __FreeBSD_version >= 1100095
457 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
458 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
459     &hn_lro_entry_count, 0, "LRO entry count");
460 #endif
461 #endif
462 
463 static int			hn_tx_taskq_cnt = 1;
464 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
465     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
466 
467 #define HN_TX_TASKQ_M_INDEP	0
468 #define HN_TX_TASKQ_M_GLOBAL	1
469 #define HN_TX_TASKQ_M_EVTTQ	2
470 
471 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
472 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
473     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
474     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
475 
476 #ifndef HN_USE_TXDESC_BUFRING
477 static int			hn_use_txdesc_bufring = 0;
478 #else
479 static int			hn_use_txdesc_bufring = 1;
480 #endif
481 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
482     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
483 
484 #ifdef HN_IFSTART_SUPPORT
485 /* Use ifnet.if_start instead of ifnet.if_transmit */
486 static int			hn_use_if_start = 0;
487 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
488     &hn_use_if_start, 0, "Use if_start TX method");
489 #endif
490 
491 /* # of channels to use */
492 static int			hn_chan_cnt = 0;
493 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
494     &hn_chan_cnt, 0,
495     "# of channels to use; each channel has one RX ring and one TX ring");
496 
497 /* # of transmit rings to use */
498 static int			hn_tx_ring_cnt = 0;
499 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
500     &hn_tx_ring_cnt, 0, "# of TX rings to use");
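/*
 * The CTLFLAG_RDTUN knobs above are tunables read at driver load; e.g.
 * (illustrative values only) they may be set from loader.conf:
 *
 *	hw.hn.chan_cnt=4
 *	hw.hn.tx_ring_cnt=4
 */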
501 
502 /* Software TX ring depth */
503 static int			hn_tx_swq_depth = 0;
504 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
505     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
506 
507 /* Enable sorted LRO and set the depth of the per-channel mbuf queue */
508 #if __FreeBSD_version >= 1100095
509 static u_int			hn_lro_mbufq_depth = 0;
510 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
511     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
512 #endif
513 
514 /* Packet transmission aggregation size limit */
515 static int			hn_tx_agg_size = -1;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
517     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
518 
519 /* Packet transmission aggregation count limit */
520 static int			hn_tx_agg_pkts = -1;
521 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
522     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
523 
524 /* VF list */
525 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
526     0, 0, hn_vflist_sysctl, "A", "VF list");
527 
528 /* VF mapping */
529 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
530     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
531 
532 static u_int			hn_cpu_index;	/* next CPU for channel */
533 static struct taskqueue		**hn_tx_taskque; /* shared TX taskqueues */
534 
535 static struct rmlock		hn_vfmap_lock;
536 static int			hn_vfmap_size;
537 static struct ifnet		**hn_vfmap;
538 
539 #ifndef RSS
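/*
 * Default 40-byte Toeplitz RSS key; this appears to be the sample key
 * from Microsoft's RSS documentation, which is also used by several
 * other NIC drivers.
 */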
540 static const uint8_t
541 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
542 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
543 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
544 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
545 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
546 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
547 };
548 #endif	/* !RSS */
549 
550 static device_method_t hn_methods[] = {
551 	/* Device interface */
552 	DEVMETHOD(device_probe,		hn_probe),
553 	DEVMETHOD(device_attach,	hn_attach),
554 	DEVMETHOD(device_detach,	hn_detach),
555 	DEVMETHOD(device_shutdown,	hn_shutdown),
556 	DEVMETHOD_END
557 };
558 
559 static driver_t hn_driver = {
560 	"hn",
561 	hn_methods,
562 	sizeof(struct hn_softc)
563 };
564 
565 static devclass_t hn_devclass;
566 
567 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
568 MODULE_VERSION(hn, 1);
569 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
570 
571 #if __FreeBSD_version >= 1100099
572 static void
573 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
574 {
575 	int i;
576 
577 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
578 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
579 }
580 #endif
581 
582 static int
583 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
584 {
585 
586 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
587 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
588 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
589 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
590 }
591 
592 static int
593 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
594 {
595 	struct hn_nvs_rndis rndis;
596 
597 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
598 	    txd->chim_size > 0, ("invalid rndis chim txd"));
599 
600 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
601 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
602 	rndis.nvs_chim_idx = txd->chim_index;
603 	rndis.nvs_chim_sz = txd->chim_size;
604 
605 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
606 	    &rndis, sizeof(rndis), &txd->send_ctx));
607 }
608 
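/*
 * Allocate a chimney (host send-buffer) slot: scan the bitmap for a
 * clear bit and claim it with atomic_testandset_long(), so no lock is
 * needed.  Returns HN_NVS_CHIM_IDX_INVALID if no slot is available;
 * hn_chim_free() releases a slot by clearing its bit.
 *
 * Usage sketch (illustrative):
 *
 *	idx = hn_chim_alloc(sc);
 *	if (idx != HN_NVS_CHIM_IDX_INVALID) {
 *		... copy the packet into chimney slot 'idx' ...
 *		hn_chim_free(sc, idx);
 *	}
 */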
609 static __inline uint32_t
610 hn_chim_alloc(struct hn_softc *sc)
611 {
612 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
613 	u_long *bmap = sc->hn_chim_bmap;
614 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
615 
616 	for (i = 0; i < bmap_cnt; ++i) {
617 		int idx;
618 
619 		idx = ffsl(~bmap[i]);
620 		if (idx == 0)
621 			continue;
622 
623 		--idx; /* ffsl is 1-based */
624 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
625 		    ("invalid i %d and idx %d", i, idx));
626 
627 		if (atomic_testandset_long(&bmap[i], idx))
628 			continue;
629 
630 		ret = i * LONG_BIT + idx;
631 		break;
632 	}
633 	return (ret);
634 }
635 
636 static __inline void
637 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
638 {
639 	u_long mask;
640 	uint32_t idx;
641 
642 	idx = chim_idx / LONG_BIT;
643 	KASSERT(idx < sc->hn_chim_bmap_cnt,
644 	    ("invalid chimney index 0x%x", chim_idx));
645 
646 	mask = 1UL << (chim_idx % LONG_BIT);
647 	KASSERT(sc->hn_chim_bmap[idx] & mask,
648 	    ("index bitmap 0x%lx, chimney index %u, "
649 	     "bitmap idx %d, bitmask 0x%lx",
650 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
651 
652 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
653 }
654 
655 #if defined(INET6) || defined(INET)
656 
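/*
 * PULLUP_HDR() makes sure at least 'len' bytes are contiguous in the
 * first mbuf; on m_pullup() failure it returns NULL from the *calling*
 * function, so callers must treat the mbuf chain as consumed.
 */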
657 #define PULLUP_HDR(m, len)				\
658 do {							\
659 	if (__predict_false((m)->m_len < (len))) {	\
660 		(m) = m_pullup((m), (len));		\
661 		if ((m) == NULL)			\
662 			return (NULL);			\
663 	}						\
664 } while (0)
665 
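/*
 * Fix up a TSO mbuf before it is handed to the host: clear the IP total
 * length (or IPv6 payload length) and seed the TCP checksum with the
 * pseudo-header checksum.
 */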
666 /*
667  * NOTE: If this function fails, m_head will be freed.
668  */
669 static __inline struct mbuf *
670 hn_tso_fixup(struct mbuf *m_head)
671 {
672 	struct ether_vlan_header *evl;
673 	struct tcphdr *th;
674 	int ehlen;
675 
676 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
677 
678 	PULLUP_HDR(m_head, sizeof(*evl));
679 	evl = mtod(m_head, struct ether_vlan_header *);
680 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
681 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
682 	else
683 		ehlen = ETHER_HDR_LEN;
684 
685 #ifdef INET
686 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
687 		struct ip *ip;
688 		int iphlen;
689 
690 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
691 		ip = mtodo(m_head, ehlen);
692 		iphlen = ip->ip_hl << 2;
693 
694 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
695 		th = mtodo(m_head, ehlen + iphlen);
696 
697 		ip->ip_len = 0;
698 		ip->ip_sum = 0;
699 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
700 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
701 	}
702 #endif
703 #if defined(INET6) && defined(INET)
704 	else
705 #endif
706 #ifdef INET6
707 	{
708 		struct ip6_hdr *ip6;
709 
710 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
711 		ip6 = mtodo(m_head, ehlen);
712 		if (ip6->ip6_nxt != IPPROTO_TCP) {
713 			m_freem(m_head);
714 			return (NULL);
715 		}
716 
717 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
718 		th = mtodo(m_head, ehlen + sizeof(*ip6));
719 
720 		ip6->ip6_plen = 0;
721 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
722 	}
723 #endif
724 	return (m_head);
725 
726 }
727 
728 /*
729  * NOTE: If this function fails, m_head will be freed.
730  */
731 static __inline struct mbuf *
732 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
733 {
734 	const struct ether_vlan_header *evl;
735 	const struct tcphdr *th;
736 	int ehlen;
737 
738 	*tcpsyn = 0;
739 
740 	PULLUP_HDR(m_head, sizeof(*evl));
741 	evl = mtod(m_head, const struct ether_vlan_header *);
742 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
743 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
744 	else
745 		ehlen = ETHER_HDR_LEN;
746 
747 #ifdef INET
748 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
749 		const struct ip *ip;
750 		int iphlen;
751 
752 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
753 		ip = mtodo(m_head, ehlen);
754 		iphlen = ip->ip_hl << 2;
755 
756 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
757 		th = mtodo(m_head, ehlen + iphlen);
758 		if (th->th_flags & TH_SYN)
759 			*tcpsyn = 1;
760 	}
761 #endif
762 #if defined(INET6) && defined(INET)
763 	else
764 #endif
765 #ifdef INET6
766 	{
767 		const struct ip6_hdr *ip6;
768 
769 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
770 		ip6 = mtodo(m_head, ehlen);
771 		if (ip6->ip6_nxt != IPPROTO_TCP)
772 			return (m_head);
773 
774 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
775 		th = mtodo(m_head, ehlen + sizeof(*ip6));
776 		if (th->th_flags & TH_SYN)
777 			*tcpsyn = 1;
778 	}
779 #endif
780 	return (m_head);
781 }
782 
783 #undef PULLUP_HDR
784 
785 #endif	/* INET6 || INET */
786 
787 static int
788 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
789 {
790 	int error = 0;
791 
792 	HN_LOCK_ASSERT(sc);
793 
794 	if (sc->hn_rx_filter != filter) {
795 		error = hn_rndis_set_rxfilter(sc, filter);
796 		if (!error)
797 			sc->hn_rx_filter = filter;
798 	}
799 	return (error);
800 }
801 
802 static int
803 hn_rxfilter_config(struct hn_softc *sc)
804 {
805 	struct ifnet *ifp = sc->hn_ifp;
806 	uint32_t filter;
807 
808 	HN_LOCK_ASSERT(sc);
809 
810 	if ((ifp->if_flags & IFF_PROMISC) ||
811 	    (sc->hn_flags & HN_FLAG_RXVF)) {
812 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
813 	} else {
814 		filter = NDIS_PACKET_TYPE_DIRECTED;
815 		if (ifp->if_flags & IFF_BROADCAST)
816 			filter |= NDIS_PACKET_TYPE_BROADCAST;
817 		/* TODO: support multicast list */
818 		if ((ifp->if_flags & IFF_ALLMULTI) ||
819 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
820 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
821 	}
822 	return (hn_set_rxfilter(sc, filter));
823 }
824 
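/*
 * Compute the per-TX-ring aggregation limits: the effective size limit
 * is the minimum of the user tunable, the RNDIS-offered limit and the
 * chimney buffer size.  Aggregation is disabled if the buffer cannot
 * hold more than two minimum-sized packets, if the packet count limit
 * is not greater than one, or if the alignment does not fit in a short.
 */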
825 static void
826 hn_set_txagg(struct hn_softc *sc)
827 {
828 	uint32_t size, pkts;
829 	int i;
830 
831 	/*
832 	 * Setup aggregation size.
833 	 */
834 	if (sc->hn_agg_size < 0)
835 		size = UINT32_MAX;
836 	else
837 		size = sc->hn_agg_size;
838 
839 	if (sc->hn_rndis_agg_size < size)
840 		size = sc->hn_rndis_agg_size;
841 
842 	/* NOTE: We only aggregate packets using chimney sending buffers. */
843 	if (size > (uint32_t)sc->hn_chim_szmax)
844 		size = sc->hn_chim_szmax;
845 
846 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
847 		/* Disable */
848 		size = 0;
849 		pkts = 0;
850 		goto done;
851 	}
852 
853 	/* NOTE: Type of the per TX ring setting is 'int'. */
854 	if (size > INT_MAX)
855 		size = INT_MAX;
856 
857 	/*
858 	 * Setup aggregation packet count.
859 	 */
860 	if (sc->hn_agg_pkts < 0)
861 		pkts = UINT32_MAX;
862 	else
863 		pkts = sc->hn_agg_pkts;
864 
865 	if (sc->hn_rndis_agg_pkts < pkts)
866 		pkts = sc->hn_rndis_agg_pkts;
867 
868 	if (pkts <= 1) {
869 		/* Disable */
870 		size = 0;
871 		pkts = 0;
872 		goto done;
873 	}
874 
875 	/* NOTE: Type of the per TX ring setting is 'short'. */
876 	if (pkts > SHRT_MAX)
877 		pkts = SHRT_MAX;
878 
879 done:
880 	/* NOTE: Type of the per TX ring setting is 'short'. */
881 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
882 		/* Disable */
883 		size = 0;
884 		pkts = 0;
885 	}
886 
887 	if (bootverbose) {
888 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
889 		    size, pkts, sc->hn_rndis_agg_align);
890 	}
891 
892 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
893 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
894 
895 		mtx_lock(&txr->hn_tx_lock);
896 		txr->hn_agg_szmax = size;
897 		txr->hn_agg_pktmax = pkts;
898 		txr->hn_agg_align = sc->hn_rndis_agg_align;
899 		mtx_unlock(&txr->hn_tx_lock);
900 	}
901 }
902 
903 static int
904 hn_get_txswq_depth(const struct hn_tx_ring *txr)
905 {
906 
907 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
908 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
909 		return txr->hn_txdesc_cnt;
910 	return hn_tx_swq_depth;
911 }
912 
913 #ifndef RSS
914 static int
915 hn_rss_reconfig(struct hn_softc *sc)
916 {
917 	int error;
918 
919 	HN_LOCK_ASSERT(sc);
920 
921 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
922 		return (ENXIO);
923 
924 	/*
925 	 * Disable RSS first.
926 	 *
927 	 * NOTE:
928 	 * Direct reconfiguration by setting the UNCHG flags does
929 	 * _not_ work properly.
930 	 */
931 	if (bootverbose)
932 		if_printf(sc->hn_ifp, "disable RSS\n");
933 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
934 	if (error) {
935 		if_printf(sc->hn_ifp, "RSS disable failed\n");
936 		return (error);
937 	}
938 
939 	/*
940 	 * Reenable the RSS w/ the updated RSS key or indirect
941 	 * table.
942 	 */
943 	if (bootverbose)
944 		if_printf(sc->hn_ifp, "reconfig RSS\n");
945 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
946 	if (error) {
947 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
948 		return (error);
949 	}
950 	return (0);
951 }
952 #endif	/* !RSS */
953 
954 static void
955 hn_rss_ind_fixup(struct hn_softc *sc)
956 {
957 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
958 	int i, nchan;
959 
960 	nchan = sc->hn_rx_ring_inuse;
961 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
962 
963 	/*
964 	 * Check indirect table to make sure that all channels in it
965 	 * can be used.
966 	 */
967 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
968 		if (rss->rss_ind[i] >= nchan) {
969 			if_printf(sc->hn_ifp,
970 			    "RSS indirect table %d fixup: %u -> %d\n",
971 			    i, rss->rss_ind[i], nchan - 1);
972 			rss->rss_ind[i] = nchan - 1;
973 		}
974 	}
975 }
976 
977 static int
978 hn_ifmedia_upd(struct ifnet *ifp __unused)
979 {
980 
981 	return EOPNOTSUPP;
982 }
983 
984 static void
985 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
986 {
987 	struct hn_softc *sc = ifp->if_softc;
988 
989 	ifmr->ifm_status = IFM_AVALID;
990 	ifmr->ifm_active = IFM_ETHER;
991 
992 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
993 		ifmr->ifm_active |= IFM_NONE;
994 		return;
995 	}
996 	ifmr->ifm_status |= IFM_ACTIVE;
997 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
998 }
999 
1000 static void
1001 hn_rxvf_set_task(void *xarg, int pending __unused)
1002 {
1003 	struct hn_rxvf_setarg *arg = xarg;
1004 
1005 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1006 }
1007 
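/*
 * Update the RX VF ifnet pointer of every RX ring.  For rings that are
 * in use, the update runs as a task on the ring's channel
 * (vmbus_chan_run_task()), so it is serialized with RX processing;
 * idle rings are updated directly.
 */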
1008 static void
1009 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1010 {
1011 	struct hn_rx_ring *rxr;
1012 	struct hn_rxvf_setarg arg;
1013 	struct task task;
1014 	int i;
1015 
1016 	HN_LOCK_ASSERT(sc);
1017 
1018 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1019 
1020 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1021 		rxr = &sc->hn_rx_ring[i];
1022 
1023 		if (i < sc->hn_rx_ring_inuse) {
1024 			arg.rxr = rxr;
1025 			arg.vf_ifp = vf_ifp;
1026 			vmbus_chan_run_task(rxr->hn_chan, &task);
1027 		} else {
1028 			rxr->hn_rxvf_ifp = vf_ifp;
1029 		}
1030 	}
1031 }
1032 
1033 static bool
1034 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1035 {
1036 	const struct ifnet *hn_ifp;
1037 
1038 	hn_ifp = sc->hn_ifp;
1039 
1040 	if (ifp == hn_ifp)
1041 		return (false);
1042 
1043 	if (ifp->if_alloctype != IFT_ETHER)
1044 		return (false);
1045 
1046 	/* Ignore lagg/vlan interfaces */
1047 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1048 	    strcmp(ifp->if_dname, "vlan") == 0)
1049 		return (false);
1050 
1051 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1052 		return (false);
1053 
1054 	return (true);
1055 }
1056 
1057 static void
1058 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1059 {
1060 	struct ifnet *hn_ifp;
1061 
1062 	HN_LOCK(sc);
1063 
1064 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1065 		goto out;
1066 
1067 	if (!hn_ismyvf(sc, ifp))
1068 		goto out;
1069 	hn_ifp = sc->hn_ifp;
1070 
1071 	if (rxvf) {
1072 		if (sc->hn_flags & HN_FLAG_RXVF)
1073 			goto out;
1074 
1075 		sc->hn_flags |= HN_FLAG_RXVF;
1076 		hn_rxfilter_config(sc);
1077 	} else {
1078 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1079 			goto out;
1080 
1081 		sc->hn_flags &= ~HN_FLAG_RXVF;
1082 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1083 			hn_rxfilter_config(sc);
1084 		else
1085 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1086 	}
1087 
1088 	hn_nvs_set_datapath(sc,
1089 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
1090 
1091 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1092 
1093 	if (rxvf) {
1094 		hn_suspend_mgmt(sc);
1095 		sc->hn_link_flags &=
1096 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1097 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1098 	} else {
1099 		hn_resume_mgmt(sc);
1100 	}
1101 
1102 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1103 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1104 
1105 	if (bootverbose) {
1106 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1107 		    rxvf ? "to" : "from", ifp->if_xname);
1108 	}
1109 out:
1110 	HN_UNLOCK(sc);
1111 }
1112 
1113 static void
1114 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1115 {
1116 
1117 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1118 		return;
1119 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1120 }
1121 
1122 static void
1123 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1124 {
1125 
1126 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1127 }
1128 
1129 static void
1130 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1131 {
1132 	struct hn_softc *sc = xsc;
1133 
1134 	HN_LOCK(sc);
1135 
1136 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1137 		goto done;
1138 
1139 	if (!hn_ismyvf(sc, ifp))
1140 		goto done;
1141 
1142 	if (sc->hn_vf_ifp != NULL) {
1143 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1144 		    sc->hn_vf_ifp->if_xname);
1145 		goto done;
1146 	}
1147 
1148 	rm_wlock(&hn_vfmap_lock);
1149 
1150 	if (ifp->if_index >= hn_vfmap_size) {
1151 		struct ifnet **newmap;
1152 		int newsize;
1153 
1154 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1155 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1156 		    M_WAITOK | M_ZERO);
1157 
1158 		memcpy(newmap, hn_vfmap,
1159 		    sizeof(struct ifnet *) * hn_vfmap_size);
1160 		free(hn_vfmap, M_DEVBUF);
1161 		hn_vfmap = newmap;
1162 		hn_vfmap_size = newsize;
1163 	}
1164 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1165 	    ("%s: ifindex %d was mapped to %s",
1166 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1167 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1168 
1169 	rm_wunlock(&hn_vfmap_lock);
1170 
1171 	sc->hn_vf_ifp = ifp;
1172 done:
1173 	HN_UNLOCK(sc);
1174 }
1175 
1176 static void
1177 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1178 {
1179 	struct hn_softc *sc = xsc;
1180 
1181 	HN_LOCK(sc);
1182 
1183 	if (sc->hn_vf_ifp == NULL)
1184 		goto done;
1185 
1186 	if (!hn_ismyvf(sc, ifp))
1187 		goto done;
1188 
1189 	sc->hn_vf_ifp = NULL;
1190 
1191 	rm_wlock(&hn_vfmap_lock);
1192 
1193 	KASSERT(ifp->if_index < hn_vfmap_size,
1194 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
1195 	if (hn_vfmap[ifp->if_index] != NULL) {
1196 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
1197 		    ("%s: ifindex %d was mapped to %s",
1198 		     ifp->if_xname, ifp->if_index,
1199 		     hn_vfmap[ifp->if_index]->if_xname));
1200 		hn_vfmap[ifp->if_index] = NULL;
1201 	}
1202 
1203 	rm_wunlock(&hn_vfmap_lock);
1204 done:
1205 	HN_UNLOCK(sc);
1206 }
1207 
1208 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1209 static const struct hyperv_guid g_net_vsc_device_type = {
1210 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1211 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1212 };
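/*
 * NOTE: the first three GUID fields above are stored in little-endian
 * byte order, which is why 0x63, 0x51, 0x61, 0xF8 corresponds to
 * F8615163 in the textual form.
 */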
1213 
1214 static int
1215 hn_probe(device_t dev)
1216 {
1217 
1218 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1219 	    &g_net_vsc_device_type) == 0) {
1220 		device_set_desc(dev, "Hyper-V Network Interface");
1221 		return BUS_PROBE_DEFAULT;
1222 	}
1223 	return ENXIO;
1224 }
1225 
1226 static int
1227 hn_attach(device_t dev)
1228 {
1229 	struct hn_softc *sc = device_get_softc(dev);
1230 	struct sysctl_oid_list *child;
1231 	struct sysctl_ctx_list *ctx;
1232 	uint8_t eaddr[ETHER_ADDR_LEN];
1233 	struct ifnet *ifp = NULL;
1234 	int error, ring_cnt, tx_ring_cnt;
1235 
1236 	sc->hn_dev = dev;
1237 	sc->hn_prichan = vmbus_get_channel(dev);
1238 	HN_LOCK_INIT(sc);
1239 
1240 	/*
1241 	 * Initialize these tunables once.
1242 	 */
1243 	sc->hn_agg_size = hn_tx_agg_size;
1244 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1245 
1246 	/*
1247 	 * Setup taskqueue for transmission.
1248 	 */
1249 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1250 		int i;
1251 
1252 		sc->hn_tx_taskqs =
1253 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1254 		    M_DEVBUF, M_WAITOK);
1255 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1256 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1257 			    M_WAITOK, taskqueue_thread_enqueue,
1258 			    &sc->hn_tx_taskqs[i]);
1259 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1260 			    "%s tx%d", device_get_nameunit(dev), i);
1261 		}
1262 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1263 		sc->hn_tx_taskqs = hn_tx_taskque;
1264 	}
1265 
1266 	/*
1267 	 * Setup taskqueue for management tasks, e.g. link status.
1268 	 */
1269 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1270 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1271 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1272 	    device_get_nameunit(dev));
1273 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1274 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1275 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1276 	    hn_netchg_status_taskfunc, sc);
1277 
1278 	/*
1279 	 * Allocate ifnet and setup its name earlier, so that if_printf
1280 	 * can be used by functions that will be called after
1281 	 * ether_ifattach().
1282 	 */
1283 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1284 	ifp->if_softc = sc;
1285 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1286 
1287 	/*
1288 	 * Initialize ifmedia earlier so that it can be unconditionally
1289 	 * destroyed if an error happens later on.
1290 	 */
1291 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1292 
1293 	/*
1294 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1295 	 * to use (tx_ring_cnt).
1296 	 *
1297 	 * NOTE:
1298 	 * The # of RX rings to use is the same as the # of channels to use.
1299 	 */
1300 	ring_cnt = hn_chan_cnt;
1301 	if (ring_cnt <= 0) {
1302 		/* Default */
1303 		ring_cnt = mp_ncpus;
1304 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1305 			ring_cnt = HN_RING_CNT_DEF_MAX;
1306 	} else if (ring_cnt > mp_ncpus) {
1307 		ring_cnt = mp_ncpus;
1308 	}
1309 #ifdef RSS
1310 	if (ring_cnt > rss_getnumbuckets())
1311 		ring_cnt = rss_getnumbuckets();
1312 #endif
1313 
1314 	tx_ring_cnt = hn_tx_ring_cnt;
1315 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1316 		tx_ring_cnt = ring_cnt;
1317 #ifdef HN_IFSTART_SUPPORT
1318 	if (hn_use_if_start) {
1319 		/* ifnet.if_start only needs one TX ring. */
1320 		tx_ring_cnt = 1;
1321 	}
1322 #endif
1323 
1324 	/*
1325 	 * Set the leader CPU for channels.
1326 	 */
1327 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1328 
1329 	/*
1330 	 * Create enough TX/RX rings, even if only a limited number of
1331 	 * channels can be allocated.
1332 	 */
1333 	error = hn_create_tx_data(sc, tx_ring_cnt);
1334 	if (error)
1335 		goto failed;
1336 	error = hn_create_rx_data(sc, ring_cnt);
1337 	if (error)
1338 		goto failed;
1339 
1340 	/*
1341 	 * Create transaction context for NVS and RNDIS transactions.
1342 	 */
1343 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1344 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1345 	if (sc->hn_xact == NULL) {
1346 		error = ENXIO;
1347 		goto failed;
1348 	}
1349 
1350 	/*
1351 	 * Install orphan handler for the revocation of this device's
1352 	 * primary channel.
1353 	 *
1354 	 * NOTE:
1355 	 * The processing order is critical here:
1356 	 * Install the orphan handler, _before_ testing whether this
1357 	 * device's primary channel has been revoked or not.
1358 	 */
1359 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1360 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1361 		error = ENXIO;
1362 		goto failed;
1363 	}
1364 
1365 	/*
1366 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1367 	 */
1368 	error = hn_synth_attach(sc, ETHERMTU);
1369 	if (error)
1370 		goto failed;
1371 
1372 	error = hn_rndis_get_eaddr(sc, eaddr);
1373 	if (error)
1374 		goto failed;
1375 
1376 #if __FreeBSD_version >= 1100099
1377 	if (sc->hn_rx_ring_inuse > 1) {
1378 		/*
1379 		 * Reduce TCP segment aggregation limit for multiple
1380 		 * RX rings to increase ACK timeliness.
1381 		 */
1382 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1383 	}
1384 #endif
1385 
1386 	/*
1387 	 * Fixup TX stuffs after synthetic parts are attached.
1388 	 */
1389 	hn_fixup_tx_data(sc);
1390 
1391 	ctx = device_get_sysctl_ctx(dev);
1392 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1393 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1394 	    &sc->hn_nvs_ver, 0, "NVS version");
1395 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1396 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1397 	    hn_ndis_version_sysctl, "A", "NDIS version");
1398 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1399 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1400 	    hn_caps_sysctl, "A", "capabilities");
1401 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1402 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1403 	    hn_hwassist_sysctl, "A", "hwassist");
1404 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1405 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1406 	    hn_rxfilter_sysctl, "A", "rxfilter");
1407 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1408 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1409 	    hn_rss_hash_sysctl, "A", "RSS hash");
1410 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1411 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1412 #ifndef RSS
1413 	/*
1414 	 * Don't allow RSS key/indirect table changes if the RSS kernel option is defined.
1415 	 */
1416 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1417 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1418 	    hn_rss_key_sysctl, "IU", "RSS key");
1419 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1420 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1421 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1422 #endif
1423 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1424 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1425 	    "RNDIS offered packet transmission aggregation size limit");
1426 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1427 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1428 	    "RNDIS offered packet transmission aggregation count limit");
1429 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1430 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1431 	    "RNDIS packet transmission aggregation alignment");
1432 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1433 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1434 	    hn_txagg_size_sysctl, "I",
1435 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1436 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1437 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1438 	    hn_txagg_pkts_sysctl, "I",
1439 	    "Packet transmission aggregation packets, "
1440 	    "0 -- disable, -1 -- auto");
1441 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1442 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1443 	    hn_polling_sysctl, "I",
1444 	    "Polling frequency: [100,1000000], 0 to disable polling");
1445 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1446 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1447 	    hn_vf_sysctl, "A", "Virtual Function's name");
1448 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
1449 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1450 	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
1451 
1452 	/*
1453 	 * Setup the ifmedia, which has been initialized earlier.
1454 	 */
1455 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1456 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1457 	/* XXX ifmedia_set really should do this for us */
1458 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1459 
1460 	/*
1461 	 * Setup the ifnet for this interface.
1462 	 */
1463 
1464 	ifp->if_baudrate = IF_Gbps(10);
1465 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1466 	ifp->if_ioctl = hn_ioctl;
1467 	ifp->if_init = hn_init;
1468 #ifdef HN_IFSTART_SUPPORT
1469 	if (hn_use_if_start) {
1470 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1471 
1472 		ifp->if_start = hn_start;
1473 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1474 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1475 		IFQ_SET_READY(&ifp->if_snd);
1476 	} else
1477 #endif
1478 	{
1479 		ifp->if_transmit = hn_transmit;
1480 		ifp->if_qflush = hn_xmit_qflush;
1481 	}
1482 
1483 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1484 #ifdef foo
1485 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1486 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1487 #endif
1488 	if (sc->hn_caps & HN_CAP_VLAN) {
1489 		/* XXX not sure about VLAN_MTU. */
1490 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1491 	}
1492 
1493 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1494 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1495 		ifp->if_capabilities |= IFCAP_TXCSUM;
1496 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1497 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1498 	if (sc->hn_caps & HN_CAP_TSO4) {
1499 		ifp->if_capabilities |= IFCAP_TSO4;
1500 		ifp->if_hwassist |= CSUM_IP_TSO;
1501 	}
1502 	if (sc->hn_caps & HN_CAP_TSO6) {
1503 		ifp->if_capabilities |= IFCAP_TSO6;
1504 		ifp->if_hwassist |= CSUM_IP6_TSO;
1505 	}
1506 
1507 	/* Enable all available capabilities by default. */
1508 	ifp->if_capenable = ifp->if_capabilities;
1509 
1510 	/*
1511 	 * Disable IPv6 TSO and TXCSUM by default; they can still
1512 	 * be enabled through SIOCSIFCAP.
1513 	 */
1514 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1515 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1516 
1517 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1518 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1519 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1520 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1521 	}
1522 
1523 	ether_ifattach(ifp, eaddr);
1524 
1525 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1526 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1527 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1528 	}
1529 
1530 	/* Inform the upper layer about the long frame support. */
1531 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1532 
1533 	/*
1534 	 * Kick off link status check.
1535 	 */
1536 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1537 	hn_update_link_status(sc);
1538 
1539 	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
1540 	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
1541 	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
1542 	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
1543 
1544 	/*
1545 	 * NOTE:
1546 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
1547 	 * event, since the interface's LLADDR is needed; the LLADDR is not
1548 	 * available when the ifnet_arrival event is triggered.
1549 	 */
1550 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
1551 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
1552 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
1553 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
1554 
1555 	return (0);
1556 failed:
1557 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1558 		hn_synth_detach(sc);
1559 	hn_detach(dev);
1560 	return (error);
1561 }
1562 
1563 static int
1564 hn_detach(device_t dev)
1565 {
1566 	struct hn_softc *sc = device_get_softc(dev);
1567 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
1568 
1569 	if (sc->hn_ifaddr_evthand != NULL)
1570 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1571 	if (sc->hn_ifnet_evthand != NULL)
1572 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1573 	if (sc->hn_ifnet_atthand != NULL) {
1574 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
1575 		    sc->hn_ifnet_atthand);
1576 	}
1577 	if (sc->hn_ifnet_dethand != NULL) {
1578 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
1579 		    sc->hn_ifnet_dethand);
1580 	}
1581 
1582 	vf_ifp = sc->hn_vf_ifp;
1583 	__compiler_membar();
1584 	if (vf_ifp != NULL)
1585 		hn_ifnet_detevent(sc, vf_ifp);
1586 
1587 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1588 		/*
1589 		 * In case the vmbus missed the orphan handler
1590 		 * installation.
1591 		 */
1592 		vmbus_xact_ctx_orphan(sc->hn_xact);
1593 	}
1594 
1595 	if (device_is_attached(dev)) {
1596 		HN_LOCK(sc);
1597 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1598 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1599 				hn_stop(sc, true);
1600 			/*
1601 			 * NOTE:
1602 			 * hn_stop() only suspends data, so management
1603 			 * stuff has to be suspended manually here.
1604 			 */
1605 			hn_suspend_mgmt(sc);
1606 			hn_synth_detach(sc);
1607 		}
1608 		HN_UNLOCK(sc);
1609 		ether_ifdetach(ifp);
1610 	}
1611 
1612 	ifmedia_removeall(&sc->hn_media);
1613 	hn_destroy_rx_data(sc);
1614 	hn_destroy_tx_data(sc);
1615 
1616 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1617 		int i;
1618 
1619 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1620 			taskqueue_free(sc->hn_tx_taskqs[i]);
1621 		free(sc->hn_tx_taskqs, M_DEVBUF);
1622 	}
1623 	taskqueue_free(sc->hn_mgmt_taskq0);
1624 
1625 	if (sc->hn_xact != NULL) {
1626 		/*
1627 		 * Uninstall the orphan handler _before_ the xact is
1628 		 * destructed.
1629 		 */
1630 		vmbus_chan_unset_orphan(sc->hn_prichan);
1631 		vmbus_xact_ctx_destroy(sc->hn_xact);
1632 	}
1633 
1634 	if_free(ifp);
1635 
1636 	HN_LOCK_DESTROY(sc);
1637 	return (0);
1638 }
1639 
1640 static int
1641 hn_shutdown(device_t dev)
1642 {
1643 
1644 	return (0);
1645 }
1646 
1647 static void
1648 hn_link_status(struct hn_softc *sc)
1649 {
1650 	uint32_t link_status;
1651 	int error;
1652 
1653 	error = hn_rndis_get_linkstatus(sc, &link_status);
1654 	if (error) {
1655 		/* XXX what to do? */
1656 		return;
1657 	}
1658 
1659 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1660 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1661 	else
1662 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1663 	if_link_state_change(sc->hn_ifp,
1664 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1665 	    LINK_STATE_UP : LINK_STATE_DOWN);
1666 }
1667 
1668 static void
1669 hn_link_taskfunc(void *xsc, int pending __unused)
1670 {
1671 	struct hn_softc *sc = xsc;
1672 
1673 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1674 		return;
1675 	hn_link_status(sc);
1676 }
1677 
1678 static void
1679 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1680 {
1681 	struct hn_softc *sc = xsc;
1682 
1683 	/* Prevent any link status checks from running. */
1684 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1685 
1686 	/*
1687 	 * Fake up a [link down --> link up] state change; a 5 second
1688 	 * delay is used, which closely simulates the miibus reaction
1689 	 * to a link down event.
1690 	 */
1691 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1692 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1693 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1694 	    &sc->hn_netchg_status, 5 * hz);
1695 }
1696 
1697 static void
1698 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1699 {
1700 	struct hn_softc *sc = xsc;
1701 
1702 	/* Re-allow link status checks. */
1703 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1704 	hn_link_status(sc);
1705 }
1706 
1707 static void
1708 hn_update_link_status(struct hn_softc *sc)
1709 {
1710 
1711 	if (sc->hn_mgmt_taskq != NULL)
1712 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1713 }
1714 
1715 static void
1716 hn_change_network(struct hn_softc *sc)
1717 {
1718 
1719 	if (sc->hn_mgmt_taskq != NULL)
1720 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1721 }
1722 
1723 static __inline int
1724 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1725     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1726 {
1727 	struct mbuf *m = *m_head;
1728 	int error;
1729 
1730 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1731 
1732 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1733 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1734 	if (error == EFBIG) {
1735 		struct mbuf *m_new;
1736 
1737 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1738 		if (m_new == NULL)
1739 			return ENOBUFS;
1740 		else
1741 			*m_head = m = m_new;
1742 		txr->hn_tx_collapsed++;
1743 
1744 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1745 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1746 	}
1747 	if (!error) {
1748 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1749 		    BUS_DMASYNC_PREWRITE);
1750 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1751 	}
1752 	return error;
1753 }
1754 
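/*
 * Drop a reference on a TX descriptor.  When the last reference goes
 * away, any aggregated descriptors are freed first, then the chimney
 * slot or DMA map is released, the mbuf is freed, and the descriptor is
 * returned to the free list (or buf_ring).  Returns 1 if the descriptor
 * was actually freed, 0 otherwise.
 */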
1755 static __inline int
1756 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1757 {
1758 
1759 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1760 	    ("put an onlist txd %#x", txd->flags));
1761 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1762 	    ("put an onagg txd %#x", txd->flags));
1763 
1764 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1765 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1766 		return 0;
1767 
1768 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1769 		struct hn_txdesc *tmp_txd;
1770 
1771 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1772 			int freed;
1773 
1774 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1775 			    ("recursive aggregation on aggregated txdesc"));
1776 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1777 			    ("not aggregated txdesc"));
1778 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1779 			    ("aggregated txdesc uses dmamap"));
1780 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1781 			    ("aggregated txdesc consumes "
1782 			     "chimney sending buffer"));
1783 			KASSERT(tmp_txd->chim_size == 0,
1784 			    ("aggregated txdesc has non-zero "
1785 			     "chimney sending size"));
1786 
1787 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1788 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1789 			freed = hn_txdesc_put(txr, tmp_txd);
1790 			KASSERT(freed, ("failed to free aggregated txdesc"));
1791 		}
1792 	}
1793 
1794 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1795 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1796 		    ("chim txd uses dmamap"));
1797 		hn_chim_free(txr->hn_sc, txd->chim_index);
1798 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1799 		txd->chim_size = 0;
1800 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1801 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1802 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1803 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1804 		    txd->data_dmap);
1805 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1806 	}
1807 
1808 	if (txd->m != NULL) {
1809 		m_freem(txd->m);
1810 		txd->m = NULL;
1811 	}
1812 
1813 	txd->flags |= HN_TXD_FLAG_ONLIST;
1814 #ifndef HN_USE_TXDESC_BUFRING
1815 	mtx_lock_spin(&txr->hn_txlist_spin);
1816 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1817 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1818 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1819 	txr->hn_txdesc_avail++;
1820 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1821 	mtx_unlock_spin(&txr->hn_txlist_spin);
1822 #else	/* HN_USE_TXDESC_BUFRING */
1823 #ifdef HN_DEBUG
1824 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1825 #endif
1826 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1827 #endif	/* !HN_USE_TXDESC_BUFRING */
1828 
1829 	return 1;
1830 }
1831 
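/*
 * Fetch a txdesc from the free list (or buf_ring) and initialize its
 * reference count to 1.  Returns NULL if no txdesc is available.
 */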
1832 static __inline struct hn_txdesc *
1833 hn_txdesc_get(struct hn_tx_ring *txr)
1834 {
1835 	struct hn_txdesc *txd;
1836 
1837 #ifndef HN_USE_TXDESC_BUFRING
1838 	mtx_lock_spin(&txr->hn_txlist_spin);
1839 	txd = SLIST_FIRST(&txr->hn_txlist);
1840 	if (txd != NULL) {
1841 		KASSERT(txr->hn_txdesc_avail > 0,
1842 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1843 		txr->hn_txdesc_avail--;
1844 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1845 	}
1846 	mtx_unlock_spin(&txr->hn_txlist_spin);
1847 #else
1848 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1849 #endif
1850 
1851 	if (txd != NULL) {
1852 #ifdef HN_USE_TXDESC_BUFRING
1853 #ifdef HN_DEBUG
1854 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1855 #endif
1856 #endif	/* HN_USE_TXDESC_BUFRING */
1857 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1858 		    STAILQ_EMPTY(&txd->agg_list) &&
1859 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1860 		    txd->chim_size == 0 &&
1861 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1862 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1863 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1864 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1865 		txd->refs = 1;
1866 	}
1867 	return txd;
1868 }
1869 
1870 static __inline void
1871 hn_txdesc_hold(struct hn_txdesc *txd)
1872 {
1873 
1874 	/* 0->1 transition will never work */
1875 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1876 	atomic_add_int(&txd->refs, 1);
1877 }
1878 
1879 static __inline void
1880 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1881 {
1882 
1883 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1884 	    ("recursive aggregation on aggregating txdesc"));
1885 
1886 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1887 	    ("already aggregated"));
1888 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1889 	    ("recursive aggregation on to-be-aggregated txdesc"));
1890 
1891 	txd->flags |= HN_TXD_FLAG_ONAGG;
1892 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1893 }
1894 
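/*
 * Return true if any txdescs of this ring are still outstanding, i.e.
 * not all of them have been returned to the free list.
 */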
1895 static bool
1896 hn_tx_ring_pending(struct hn_tx_ring *txr)
1897 {
1898 	bool pending = false;
1899 
1900 #ifndef HN_USE_TXDESC_BUFRING
1901 	mtx_lock_spin(&txr->hn_txlist_spin);
1902 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1903 		pending = true;
1904 	mtx_unlock_spin(&txr->hn_txlist_spin);
1905 #else
1906 	if (!buf_ring_full(txr->hn_txdesc_br))
1907 		pending = true;
1908 #endif
1909 	return (pending);
1910 }
1911 
1912 static __inline void
1913 hn_txeof(struct hn_tx_ring *txr)
1914 {
1915 	txr->hn_has_txeof = 0;
1916 	txr->hn_txeof(txr);
1917 }
1918 
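/*
 * NVS send-done callback: release the txdesc and, once enough
 * completions have accumulated on an OACTIVE ring, run TX EOF
 * processing early.
 */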
1919 static void
1920 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1921     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1922 {
1923 	struct hn_txdesc *txd = sndc->hn_cbarg;
1924 	struct hn_tx_ring *txr;
1925 
1926 	txr = txd->txr;
1927 	KASSERT(txr->hn_chan == chan,
1928 	    ("channel mismatch, on chan%u, should be chan%u",
1929 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1930 
1931 	txr->hn_has_txeof = 1;
1932 	hn_txdesc_put(txr, txd);
1933 
1934 	++txr->hn_txdone_cnt;
1935 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1936 		txr->hn_txdone_cnt = 0;
1937 		if (txr->hn_oactive)
1938 			hn_txeof(txr);
1939 	}
1940 }
1941 
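/*
 * Channel rollup: flush pending LRO aggregations on the RX ring and run
 * TX EOF processing, if the associated TX ring has pending completions.
 */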
1942 static void
1943 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1944 {
1945 #if defined(INET) || defined(INET6)
1946 	tcp_lro_flush_all(&rxr->hn_lro);
1947 #endif
1948 
1949 	/*
1950 	 * NOTE:
1951 	 * 'txr' could be NULL, if multiple channels and the
1952 	 * ifnet.if_start method are used.
1953 	 */
1954 	if (txr == NULL || !txr->hn_has_txeof)
1955 		return;
1956 
1957 	txr->hn_txdone_cnt = 0;
1958 	hn_txeof(txr);
1959 }
1960 
1961 static __inline uint32_t
1962 hn_rndis_pktmsg_offset(uint32_t ofs)
1963 {
1964 
1965 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1966 	    ("invalid RNDIS packet msg offset %u", ofs));
1967 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1968 }
1969 
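/*
 * Append a per-packet-info record to the RNDIS packet message and
 * return a pointer to its data area for the caller to fill in.
 */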
1970 static __inline void *
1971 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1972     size_t pi_dlen, uint32_t pi_type)
1973 {
1974 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1975 	struct rndis_pktinfo *pi;
1976 
1977 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1978 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1979 
1980 	/*
1981 	 * Per-packet-info does not move; it only grows.
1982 	 *
1983 	 * NOTE:
1984 	 * rm_pktinfooffset in this phase counts from the beginning
1985 	 * of rndis_packet_msg.
1986 	 */
1987 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1988 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1989 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1990 	    pkt->rm_pktinfolen);
1991 	pkt->rm_pktinfolen += pi_size;
1992 
1993 	pi->rm_size = pi_size;
1994 	pi->rm_type = pi_type;
1995 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1996 
1997 	return (pi->rm_data);
1998 }
1999 
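/*
 * Send the ring's aggregating txdesc and reset the aggregation state.
 * On failure the saved mbuf is freed and OERRORS is bumped by the
 * number of aggregated packets.
 */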
2000 static __inline int
2001 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2002 {
2003 	struct hn_txdesc *txd;
2004 	struct mbuf *m;
2005 	int error, pkts;
2006 
2007 	txd = txr->hn_agg_txd;
2008 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2009 
2010 	/*
2011 	 * Since hn_txpkt() will reset this temporary stat, save
2012 	 * it now, so that oerrors can be updated properly, if
2013 	 * hn_txpkt() ever fails.
2014 	 */
2015 	pkts = txr->hn_stat_pkts;
2016 
2017 	/*
2018 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2019 	 * failure, save it for later freeing, if hn_txpkt() ever
2020 	 * fails.
2021 	 */
2022 	m = txd->m;
2023 	error = hn_txpkt(ifp, txr, txd);
2024 	if (__predict_false(error)) {
2025 		/* txd is freed, but m is not. */
2026 		m_freem(m);
2027 
2028 		txr->hn_flush_failed++;
2029 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2030 	}
2031 
2032 	/* Reset all aggregation states. */
2033 	txr->hn_agg_txd = NULL;
2034 	txr->hn_agg_szleft = 0;
2035 	txr->hn_agg_pktleft = 0;
2036 	txr->hn_agg_prevpkt = NULL;
2037 
2038 	return (error);
2039 }
2040 
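/*
 * Try to reserve space for this packet in the chimney sending buffer,
 * aggregating it with previously queued packets when possible.  Returns
 * a pointer into the chimney buffer, or NULL if chimney sending cannot
 * be used for this packet.
 */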
2041 static void *
2042 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2043     int pktsize)
2044 {
2045 	void *chim;
2046 
2047 	if (txr->hn_agg_txd != NULL) {
2048 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2049 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2050 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2051 			int olen;
2052 
2053 			/*
2054 			 * Update the previous RNDIS packet's total length;
2055 			 * it can be increased due to the mandatory alignment
2056 			 * padding for this RNDIS packet.  Also update the
2057 			 * aggregating txdesc's chimney sending buffer size
2058 			 * accordingly.
2059 			 *
2060 			 * XXX
2061 			 * Zero-out the padding, as required by the RNDIS spec.
2062 			 */
2063 			olen = pkt->rm_len;
2064 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2065 			agg_txd->chim_size += pkt->rm_len - olen;
2066 
2067 			/* Link this txdesc to the parent. */
2068 			hn_txdesc_agg(agg_txd, txd);
2069 
2070 			chim = (uint8_t *)pkt + pkt->rm_len;
2071 			/* Save the current packet for later fixup. */
2072 			txr->hn_agg_prevpkt = chim;
2073 
2074 			txr->hn_agg_pktleft--;
2075 			txr->hn_agg_szleft -= pktsize;
2076 			if (txr->hn_agg_szleft <=
2077 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2078 				/*
2079 				 * Probably can't aggregate more packets;
2080 				 * flush this aggregating txdesc proactively.
2081 				 */
2082 				txr->hn_agg_pktleft = 0;
2083 			}
2084 			/* Done! */
2085 			return (chim);
2086 		}
2087 		hn_flush_txagg(ifp, txr);
2088 	}
2089 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2090 
2091 	txr->hn_tx_chimney_tried++;
2092 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2093 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2094 		return (NULL);
2095 	txr->hn_tx_chimney++;
2096 
2097 	chim = txr->hn_sc->hn_chim +
2098 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2099 
2100 	if (txr->hn_agg_pktmax > 1 &&
2101 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2102 		txr->hn_agg_txd = txd;
2103 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2104 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2105 		txr->hn_agg_prevpkt = chim;
2106 	}
2107 	return (chim);
2108 }
2109 
2110 /*
2111  * NOTE:
2112  * If this function fails, then both txd and m_head0 will be freed.
2113  */
2114 static int
2115 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2116     struct mbuf **m_head0)
2117 {
2118 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2119 	int error, nsegs, i;
2120 	struct mbuf *m_head = *m_head0;
2121 	struct rndis_packet_msg *pkt;
2122 	uint32_t *pi_data;
2123 	void *chim = NULL;
2124 	int pkt_hlen, pkt_size;
2125 
2126 	pkt = txd->rndis_pkt;
2127 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2128 	if (pkt_size < txr->hn_chim_size) {
2129 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2130 		if (chim != NULL)
2131 			pkt = chim;
2132 	} else {
2133 		if (txr->hn_agg_txd != NULL)
2134 			hn_flush_txagg(ifp, txr);
2135 	}
2136 
2137 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2138 	pkt->rm_len = m_head->m_pkthdr.len;
2139 	pkt->rm_dataoffset = 0;
2140 	pkt->rm_datalen = m_head->m_pkthdr.len;
2141 	pkt->rm_oobdataoffset = 0;
2142 	pkt->rm_oobdatalen = 0;
2143 	pkt->rm_oobdataelements = 0;
2144 	pkt->rm_pktinfooffset = sizeof(*pkt);
2145 	pkt->rm_pktinfolen = 0;
2146 	pkt->rm_vchandle = 0;
2147 	pkt->rm_reserved = 0;
2148 
2149 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2150 		/*
2151 		 * Set the hash value for this packet, so that the host could
2152 		 * dispatch the TX done event for this packet back to this TX
2153 		 * ring's channel.
2154 		 */
2155 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2156 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2157 		*pi_data = txr->hn_tx_idx;
2158 	}
2159 
2160 	if (m_head->m_flags & M_VLANTAG) {
2161 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2162 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2163 		*pi_data = NDIS_VLAN_INFO_MAKE(
2164 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2165 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2166 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2167 	}
2168 
2169 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2170 #if defined(INET6) || defined(INET)
2171 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2172 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2173 #ifdef INET
2174 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2175 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2176 			    m_head->m_pkthdr.tso_segsz);
2177 		}
2178 #endif
2179 #if defined(INET6) && defined(INET)
2180 		else
2181 #endif
2182 #ifdef INET6
2183 		{
2184 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2185 			    m_head->m_pkthdr.tso_segsz);
2186 		}
2187 #endif
2188 #endif	/* INET6 || INET */
2189 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2190 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2191 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2192 		if (m_head->m_pkthdr.csum_flags &
2193 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2194 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
2195 		} else {
2196 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
2197 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2198 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
2199 		}
2200 
2201 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2202 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2203 		else if (m_head->m_pkthdr.csum_flags &
2204 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
2205 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2206 	}
2207 
2208 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2209 	/* Fixup RNDIS packet message total length */
2210 	pkt->rm_len += pkt_hlen;
2211 	/* Convert RNDIS packet message offsets */
2212 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2213 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2214 
2215 	/*
2216 	 * Fast path: Chimney sending.
2217 	 */
2218 	if (chim != NULL) {
2219 		struct hn_txdesc *tgt_txd = txd;
2220 
2221 		if (txr->hn_agg_txd != NULL) {
2222 			tgt_txd = txr->hn_agg_txd;
2223 #ifdef INVARIANTS
2224 			*m_head0 = NULL;
2225 #endif
2226 		}
2227 
2228 		KASSERT(pkt == chim,
2229 		    ("RNDIS pkt not in chimney sending buffer"));
2230 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2231 		    ("chimney sending buffer is not used"));
2232 		tgt_txd->chim_size += pkt->rm_len;
2233 
2234 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2235 		    ((uint8_t *)chim) + pkt_hlen);
2236 
2237 		txr->hn_gpa_cnt = 0;
2238 		txr->hn_sendpkt = hn_txpkt_chim;
2239 		goto done;
2240 	}
2241 
2242 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2243 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2244 	    ("chimney buffer is used"));
2245 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2246 
2247 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2248 	if (__predict_false(error)) {
2249 		int freed;
2250 
2251 		/*
2252 		 * This mbuf is not linked w/ the txd yet, so free it now.
2253 		 */
2254 		m_freem(m_head);
2255 		*m_head0 = NULL;
2256 
2257 		freed = hn_txdesc_put(txr, txd);
2258 		KASSERT(freed != 0,
2259 		    ("fail to free txd upon txdma error"));
2260 
2261 		txr->hn_txdma_failed++;
2262 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2263 		return error;
2264 	}
2265 	*m_head0 = m_head;
2266 
2267 	/* +1 RNDIS packet message */
2268 	txr->hn_gpa_cnt = nsegs + 1;
2269 
2270 	/* send packet with page buffer */
2271 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2272 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2273 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2274 
2275 	/*
2276 	 * Fill the page buffers with mbuf info after the page
2277 	 * buffer for RNDIS packet message.
2278 	 */
2279 	for (i = 0; i < nsegs; ++i) {
2280 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2281 
2282 		gpa->gpa_page = atop(segs[i].ds_addr);
2283 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2284 		gpa->gpa_len = segs[i].ds_len;
2285 	}
2286 
2287 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2288 	txd->chim_size = 0;
2289 	txr->hn_sendpkt = hn_txpkt_sglist;
2290 done:
2291 	txd->m = m_head;
2292 
2293 	/* Set the completion routine */
2294 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2295 
2296 	/* Update temporary stats for later use. */
2297 	txr->hn_stat_pkts++;
2298 	txr->hn_stat_size += m_head->m_pkthdr.len;
2299 	if (m_head->m_flags & M_MCAST)
2300 		txr->hn_stat_mcasts++;
2301 
2302 	return 0;
2303 }
2304 
2305 /*
2306  * NOTE:
2307  * If this function fails, then txd will be freed, but the mbuf
2308  * associated w/ the txd will _not_ be freed.
2309  */
2310 static int
2311 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2312 {
2313 	int error, send_failed = 0, has_bpf;
2314 
2315 again:
2316 	has_bpf = bpf_peers_present(ifp->if_bpf);
2317 	if (has_bpf) {
2318 		/*
2319 		 * Make sure that this txd and any aggregated txds are not
2320 		 * freed before ETHER_BPF_MTAP.
2321 		 */
2322 		hn_txdesc_hold(txd);
2323 	}
2324 	error = txr->hn_sendpkt(txr, txd);
2325 	if (!error) {
2326 		if (has_bpf) {
2327 			const struct hn_txdesc *tmp_txd;
2328 
2329 			ETHER_BPF_MTAP(ifp, txd->m);
2330 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2331 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2332 		}
2333 
2334 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2335 #ifdef HN_IFSTART_SUPPORT
2336 		if (!hn_use_if_start)
2337 #endif
2338 		{
2339 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2340 			    txr->hn_stat_size);
2341 			if (txr->hn_stat_mcasts != 0) {
2342 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2343 				    txr->hn_stat_mcasts);
2344 			}
2345 		}
2346 		txr->hn_pkts += txr->hn_stat_pkts;
2347 		txr->hn_sends++;
2348 	}
2349 	if (has_bpf)
2350 		hn_txdesc_put(txr, txd);
2351 
2352 	if (__predict_false(error)) {
2353 		int freed;
2354 
2355 		/*
2356 		 * This should "really rarely" happen.
2357 		 *
2358 		 * XXX Too many RX to be acked or too many sideband
2359 		 * commands to run?  Ask netvsc_channel_rollup()
2360 		 * to kick start later.
2361 		 */
2362 		txr->hn_has_txeof = 1;
2363 		if (!send_failed) {
2364 			txr->hn_send_failed++;
2365 			send_failed = 1;
2366 			/*
2367 			 * Try sending again after setting hn_has_txeof,
2368 			 * in case we missed the last
2369 			 * netvsc_channel_rollup().
2370 			 */
2371 			goto again;
2372 		}
2373 		if_printf(ifp, "send failed\n");
2374 
2375 		/*
2376 		 * Caller will perform further processing on the
2377 		 * associated mbuf, so don't free it in hn_txdesc_put();
2378 		 * only unload it from the DMA map in hn_txdesc_put(),
2379 		 * if it was loaded.
2380 		 */
2381 		txd->m = NULL;
2382 		freed = hn_txdesc_put(txr, txd);
2383 		KASSERT(freed != 0,
2384 		    ("fail to free txd upon send error"));
2385 
2386 		txr->hn_send_failed++;
2387 	}
2388 
2389 	/* Reset temporary stats, after this sending is done. */
2390 	txr->hn_stat_size = 0;
2391 	txr->hn_stat_pkts = 0;
2392 	txr->hn_stat_mcasts = 0;
2393 
2394 	return (error);
2395 }
2396 
2397 /*
2398  * Append the specified data to the indicated mbuf chain,
2399  * extending the mbuf chain if the new data does not fit in
2400  * existing space.
2401  *
2402  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2403  * There should be an equivalent in the kernel mbuf code,
2404  * but there does not appear to be one yet.
2405  *
2406  * Differs from m_append() in that additional mbufs are
2407  * allocated with cluster size MJUMPAGESIZE, and filled
2408  * accordingly.
2409  *
2410  * Return 1 if able to complete the job; otherwise 0.
2411  */
2412 static int
2413 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2414 {
2415 	struct mbuf *m, *n;
2416 	int remainder, space;
2417 
2418 	for (m = m0; m->m_next != NULL; m = m->m_next)
2419 		;
2420 	remainder = len;
2421 	space = M_TRAILINGSPACE(m);
2422 	if (space > 0) {
2423 		/*
2424 		 * Copy into available space.
2425 		 */
2426 		if (space > remainder)
2427 			space = remainder;
2428 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2429 		m->m_len += space;
2430 		cp += space;
2431 		remainder -= space;
2432 	}
2433 	while (remainder > 0) {
2434 		/*
2435 		 * Allocate a new mbuf; could check space
2436 		 * and allocate a cluster instead.
2437 		 */
2438 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2439 		if (n == NULL)
2440 			break;
2441 		n->m_len = min(MJUMPAGESIZE, remainder);
2442 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2443 		cp += n->m_len;
2444 		remainder -= n->m_len;
2445 		m->m_next = n;
2446 		m = n;
2447 	}
2448 	if (m0->m_flags & M_PKTHDR)
2449 		m0->m_pkthdr.len += len - remainder;
2450 
2451 	return (remainder == 0);
2452 }
2453 
2454 #if defined(INET) || defined(INET6)
2455 static __inline int
2456 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2457 {
2458 #if __FreeBSD_version >= 1100095
2459 	if (hn_lro_mbufq_depth) {
2460 		tcp_lro_queue_mbuf(lc, m);
2461 		return 0;
2462 	}
2463 #endif
2464 	return tcp_lro_rx(lc, m, 0);
2465 }
2466 #endif
2467 
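/*
 * Receive one packet: copy the data into an mbuf, apply RX checksum
 * offload, VLAN and RSS hash information, then pass the mbuf to LRO or
 * directly to the network stack.
 */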
2468 static int
2469 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2470     const struct hn_rxinfo *info)
2471 {
2472 	struct ifnet *ifp;
2473 	struct mbuf *m_new;
2474 	int size, do_lro = 0, do_csum = 1;
2475 	int hash_type;
2476 
2477 	/* If the VF is active, inject the packet through the VF */
2478 	ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
2479 
2480 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2481 		/*
2482 		 * NOTE:
2483 		 * See the NOTE of hn_rndis_init_fixat().  This
2484 		 * function can be reached immediately after the
2485 		 * RNDIS is initialized but before the ifnet is
2486 		 * set up on the hn_attach() path; drop the unexpected
2487 		 * packets.
2488 		 */
2489 		return (0);
2490 	}
2491 
2492 	if (dlen <= MHLEN) {
2493 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2494 		if (m_new == NULL) {
2495 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2496 			return (0);
2497 		}
2498 		memcpy(mtod(m_new, void *), data, dlen);
2499 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2500 		rxr->hn_small_pkts++;
2501 	} else {
2502 		/*
2503 		 * Get an mbuf with a cluster.  For packets 2K or less,
2504 		 * get a standard 2K cluster.  For anything larger, get a
2505 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2506 		 * if looped around to the Hyper-V TX channel, so avoid them.
2507 		 */
2508 		size = MCLBYTES;
2509 		if (dlen > MCLBYTES) {
2510 			/* 4096 */
2511 			size = MJUMPAGESIZE;
2512 		}
2513 
2514 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2515 		if (m_new == NULL) {
2516 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2517 			return (0);
2518 		}
2519 
2520 		hv_m_append(m_new, dlen, data);
2521 	}
2522 	m_new->m_pkthdr.rcvif = ifp;
2523 
2524 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2525 		do_csum = 0;
2526 
2527 	/* receive side checksum offload */
2528 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2529 		/* IP csum offload */
2530 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2531 			m_new->m_pkthdr.csum_flags |=
2532 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2533 			rxr->hn_csum_ip++;
2534 		}
2535 
2536 		/* TCP/UDP csum offload */
2537 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2538 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2539 			m_new->m_pkthdr.csum_flags |=
2540 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2541 			m_new->m_pkthdr.csum_data = 0xffff;
2542 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2543 				rxr->hn_csum_tcp++;
2544 			else
2545 				rxr->hn_csum_udp++;
2546 		}
2547 
2548 		/*
2549 		 * XXX
2550 		 * As of this writing (Oct 28th, 2016), the host side will turn
2551 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2552 		 * the do_lro setting here is actually _not_ accurate.  We
2553 		 * depend on the RSS hash type check to reset do_lro.
2554 		 */
2555 		if ((info->csum_info &
2556 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2557 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2558 			do_lro = 1;
2559 	} else {
2560 		const struct ether_header *eh;
2561 		uint16_t etype;
2562 		int hoff;
2563 
2564 		hoff = sizeof(*eh);
2565 		if (m_new->m_len < hoff)
2566 			goto skip;
2567 		eh = mtod(m_new, struct ether_header *);
2568 		etype = ntohs(eh->ether_type);
2569 		if (etype == ETHERTYPE_VLAN) {
2570 			const struct ether_vlan_header *evl;
2571 
2572 			hoff = sizeof(*evl);
2573 			if (m_new->m_len < hoff)
2574 				goto skip;
2575 			evl = mtod(m_new, struct ether_vlan_header *);
2576 			etype = ntohs(evl->evl_proto);
2577 		}
2578 
2579 		if (etype == ETHERTYPE_IP) {
2580 			int pr;
2581 
2582 			pr = hn_check_iplen(m_new, hoff);
2583 			if (pr == IPPROTO_TCP) {
2584 				if (do_csum &&
2585 				    (rxr->hn_trust_hcsum &
2586 				     HN_TRUST_HCSUM_TCP)) {
2587 					rxr->hn_csum_trusted++;
2588 					m_new->m_pkthdr.csum_flags |=
2589 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2590 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2591 					m_new->m_pkthdr.csum_data = 0xffff;
2592 				}
2593 				do_lro = 1;
2594 			} else if (pr == IPPROTO_UDP) {
2595 				if (do_csum &&
2596 				    (rxr->hn_trust_hcsum &
2597 				     HN_TRUST_HCSUM_UDP)) {
2598 					rxr->hn_csum_trusted++;
2599 					m_new->m_pkthdr.csum_flags |=
2600 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2601 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2602 					m_new->m_pkthdr.csum_data = 0xffff;
2603 				}
2604 			} else if (pr != IPPROTO_DONE && do_csum &&
2605 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2606 				rxr->hn_csum_trusted++;
2607 				m_new->m_pkthdr.csum_flags |=
2608 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2609 			}
2610 		}
2611 	}
2612 skip:
2613 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2614 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2615 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2616 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2617 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2618 		m_new->m_flags |= M_VLANTAG;
2619 	}
2620 
2621 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2622 		rxr->hn_rss_pkts++;
2623 		m_new->m_pkthdr.flowid = info->hash_value;
2624 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2625 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2626 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2627 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2628 
2629 			/*
2630 			 * NOTE:
2631 			 * do_lro is reset, if the hash types are not TCP
2632 			 * related.  See the comment in the above csum_flags
2633 			 * setup section.
2634 			 */
2635 			switch (type) {
2636 			case NDIS_HASH_IPV4:
2637 				hash_type = M_HASHTYPE_RSS_IPV4;
2638 				do_lro = 0;
2639 				break;
2640 
2641 			case NDIS_HASH_TCP_IPV4:
2642 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2643 				break;
2644 
2645 			case NDIS_HASH_IPV6:
2646 				hash_type = M_HASHTYPE_RSS_IPV6;
2647 				do_lro = 0;
2648 				break;
2649 
2650 			case NDIS_HASH_IPV6_EX:
2651 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2652 				do_lro = 0;
2653 				break;
2654 
2655 			case NDIS_HASH_TCP_IPV6:
2656 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2657 				break;
2658 
2659 			case NDIS_HASH_TCP_IPV6_EX:
2660 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2661 				break;
2662 			}
2663 		}
2664 	} else {
2665 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2666 		hash_type = M_HASHTYPE_OPAQUE;
2667 	}
2668 	M_HASHTYPE_SET(m_new, hash_type);
2669 
2670 	/*
2671 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2672 	 * messages (not just data messages) will trigger a response.
2673 	 */
2674 
2675 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2676 	rxr->hn_pkts++;
2677 
2678 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2679 #if defined(INET) || defined(INET6)
2680 		struct lro_ctrl *lro = &rxr->hn_lro;
2681 
2682 		if (lro->lro_cnt) {
2683 			rxr->hn_lro_tried++;
2684 			if (hn_lro_rx(lro, m_new) == 0) {
2685 				/* DONE! */
2686 				return 0;
2687 			}
2688 		}
2689 #endif
2690 	}
2691 
2692 	/* We're not holding the lock here, so don't release it */
2693 	(*ifp->if_input)(ifp, m_new);
2694 
2695 	return (0);
2696 }
2697 
2698 static int
2699 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2700 {
2701 	struct hn_softc *sc = ifp->if_softc;
2702 	struct ifreq *ifr = (struct ifreq *)data;
2703 	int mask, error = 0;
2704 
2705 	switch (cmd) {
2706 	case SIOCSIFMTU:
2707 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2708 			error = EINVAL;
2709 			break;
2710 		}
2711 
2712 		HN_LOCK(sc);
2713 
2714 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2715 			HN_UNLOCK(sc);
2716 			break;
2717 		}
2718 
2719 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2720 			/* Can't change MTU */
2721 			HN_UNLOCK(sc);
2722 			error = EOPNOTSUPP;
2723 			break;
2724 		}
2725 
2726 		if (ifp->if_mtu == ifr->ifr_mtu) {
2727 			HN_UNLOCK(sc);
2728 			break;
2729 		}
2730 
2731 		/*
2732 		 * Suspend this interface before the synthetic parts
2733 		 * are ripped.
2734 		 */
2735 		hn_suspend(sc);
2736 
2737 		/*
2738 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2739 		 */
2740 		hn_synth_detach(sc);
2741 
2742 		/*
2743 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2744 		 * with the new MTU setting.
2745 		 */
2746 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2747 		if (error) {
2748 			HN_UNLOCK(sc);
2749 			break;
2750 		}
2751 
2752 		/*
2753 		 * Commit the requested MTU, after the synthetic parts
2754 		 * have been successfully attached.
2755 		 */
2756 		ifp->if_mtu = ifr->ifr_mtu;
2757 
2758 		/*
2759 		 * Make sure that various parameters based on MTU are
2760 		 * still valid, after the MTU change.
2761 		 */
2762 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2763 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2764 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2765 #if __FreeBSD_version >= 1100099
2766 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2767 		    HN_LRO_LENLIM_MIN(ifp))
2768 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2769 #endif
2770 
2771 		/*
2772 		 * All done!  Resume the interface now.
2773 		 */
2774 		hn_resume(sc);
2775 
2776 		HN_UNLOCK(sc);
2777 		break;
2778 
2779 	case SIOCSIFFLAGS:
2780 		HN_LOCK(sc);
2781 
2782 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2783 			HN_UNLOCK(sc);
2784 			break;
2785 		}
2786 
2787 		if (ifp->if_flags & IFF_UP) {
2788 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2789 				/*
2790 				 * Caller might hold a mutex, e.g.
2791 				 * bpf; use busy-wait for the RNDIS
2792 				 * reply.
2793 				 */
2794 				HN_NO_SLEEPING(sc);
2795 				hn_rxfilter_config(sc);
2796 				HN_SLEEPING_OK(sc);
2797 			} else {
2798 				hn_init_locked(sc);
2799 			}
2800 		} else {
2801 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2802 				hn_stop(sc, false);
2803 		}
2804 		sc->hn_if_flags = ifp->if_flags;
2805 
2806 		HN_UNLOCK(sc);
2807 		break;
2808 
2809 	case SIOCSIFCAP:
2810 		HN_LOCK(sc);
2811 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2812 
2813 		if (mask & IFCAP_TXCSUM) {
2814 			ifp->if_capenable ^= IFCAP_TXCSUM;
2815 			if (ifp->if_capenable & IFCAP_TXCSUM)
2816 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2817 			else
2818 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2819 		}
2820 		if (mask & IFCAP_TXCSUM_IPV6) {
2821 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2822 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2823 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2824 			else
2825 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2826 		}
2827 
2828 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2829 		if (mask & IFCAP_RXCSUM)
2830 			ifp->if_capenable ^= IFCAP_RXCSUM;
2831 #ifdef foo
2832 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2833 		if (mask & IFCAP_RXCSUM_IPV6)
2834 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2835 #endif
2836 
2837 		if (mask & IFCAP_LRO)
2838 			ifp->if_capenable ^= IFCAP_LRO;
2839 
2840 		if (mask & IFCAP_TSO4) {
2841 			ifp->if_capenable ^= IFCAP_TSO4;
2842 			if (ifp->if_capenable & IFCAP_TSO4)
2843 				ifp->if_hwassist |= CSUM_IP_TSO;
2844 			else
2845 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2846 		}
2847 		if (mask & IFCAP_TSO6) {
2848 			ifp->if_capenable ^= IFCAP_TSO6;
2849 			if (ifp->if_capenable & IFCAP_TSO6)
2850 				ifp->if_hwassist |= CSUM_IP6_TSO;
2851 			else
2852 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2853 		}
2854 
2855 		HN_UNLOCK(sc);
2856 		break;
2857 
2858 	case SIOCADDMULTI:
2859 	case SIOCDELMULTI:
2860 		HN_LOCK(sc);
2861 
2862 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2863 			HN_UNLOCK(sc);
2864 			break;
2865 		}
2866 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2867 			/*
2868 			 * Multicast uses mutex; use busy-wait for
2869 			 * the RNDIS reply.
2870 			 */
2871 			HN_NO_SLEEPING(sc);
2872 			hn_rxfilter_config(sc);
2873 			HN_SLEEPING_OK(sc);
2874 		}
2875 
2876 		HN_UNLOCK(sc);
2877 		break;
2878 
2879 	case SIOCSIFMEDIA:
2880 	case SIOCGIFMEDIA:
2881 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2882 		break;
2883 
2884 	default:
2885 		error = ether_ioctl(ifp, cmd, data);
2886 		break;
2887 	}
2888 	return (error);
2889 }
2890 
2891 static void
2892 hn_stop(struct hn_softc *sc, bool detaching)
2893 {
2894 	struct ifnet *ifp = sc->hn_ifp;
2895 	int i;
2896 
2897 	HN_LOCK_ASSERT(sc);
2898 
2899 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2900 	    ("synthetic parts were not attached"));
2901 
2902 	/* Disable polling. */
2903 	hn_polling(sc, 0);
2904 
2905 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2906 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2907 	hn_suspend_data(sc);
2908 
2909 	/* Clear OACTIVE bit. */
2910 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2911 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2912 		sc->hn_tx_ring[i].hn_oactive = 0;
2913 
2914 	/*
2915 	 * If the VF is active, make sure the filter is not 0, even if
2916 	 * the synthetic NIC is down.
2917 	 */
2918 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
2919 		hn_rxfilter_config(sc);
2920 }
2921 
2922 static void
2923 hn_init_locked(struct hn_softc *sc)
2924 {
2925 	struct ifnet *ifp = sc->hn_ifp;
2926 	int i;
2927 
2928 	HN_LOCK_ASSERT(sc);
2929 
2930 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2931 		return;
2932 
2933 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2934 		return;
2935 
2936 	/* Configure RX filter */
2937 	hn_rxfilter_config(sc);
2938 
2939 	/* Clear OACTIVE bit. */
2940 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2941 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2942 		sc->hn_tx_ring[i].hn_oactive = 0;
2943 
2944 	/* Clear TX 'suspended' bit. */
2945 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2946 
2947 	/* Everything is ready; unleash! */
2948 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2949 
2950 	/* Re-enable polling if requested. */
2951 	if (sc->hn_pollhz > 0)
2952 		hn_polling(sc, sc->hn_pollhz);
2953 }
2954 
2955 static void
2956 hn_init(void *xsc)
2957 {
2958 	struct hn_softc *sc = xsc;
2959 
2960 	HN_LOCK(sc);
2961 	hn_init_locked(sc);
2962 	HN_UNLOCK(sc);
2963 }
2964 
2965 #if __FreeBSD_version >= 1100099
2966 
2967 static int
2968 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2969 {
2970 	struct hn_softc *sc = arg1;
2971 	unsigned int lenlim;
2972 	int error;
2973 
2974 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2975 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2976 	if (error || req->newptr == NULL)
2977 		return error;
2978 
2979 	HN_LOCK(sc);
2980 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2981 	    lenlim > TCP_LRO_LENGTH_MAX) {
2982 		HN_UNLOCK(sc);
2983 		return EINVAL;
2984 	}
2985 	hn_set_lro_lenlim(sc, lenlim);
2986 	HN_UNLOCK(sc);
2987 
2988 	return 0;
2989 }
2990 
2991 static int
2992 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2993 {
2994 	struct hn_softc *sc = arg1;
2995 	int ackcnt, error, i;
2996 
2997 	/*
2998 	 * lro_ackcnt_lim is the append count limit;
2999 	 * +1 turns it into the aggregation limit.
3000 	 */
3001 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3002 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3003 	if (error || req->newptr == NULL)
3004 		return error;
3005 
3006 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3007 		return EINVAL;
3008 
3009 	/*
3010 	 * Convert aggregation limit back to append
3011 	 * count limit.
3012 	 */
3013 	--ackcnt;
3014 	HN_LOCK(sc);
3015 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3016 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
3017 	HN_UNLOCK(sc);
3018 	return 0;
3019 }
3020 
3021 #endif
3022 
3023 static int
3024 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3025 {
3026 	struct hn_softc *sc = arg1;
3027 	int hcsum = arg2;
3028 	int on, error, i;
3029 
3030 	on = 0;
3031 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3032 		on = 1;
3033 
3034 	error = sysctl_handle_int(oidp, &on, 0, req);
3035 	if (error || req->newptr == NULL)
3036 		return error;
3037 
3038 	HN_LOCK(sc);
3039 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3040 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3041 
3042 		if (on)
3043 			rxr->hn_trust_hcsum |= hcsum;
3044 		else
3045 			rxr->hn_trust_hcsum &= ~hcsum;
3046 	}
3047 	HN_UNLOCK(sc);
3048 	return 0;
3049 }
3050 
3051 static int
3052 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3053 {
3054 	struct hn_softc *sc = arg1;
3055 	int chim_size, error;
3056 
3057 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
3058 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
3059 	if (error || req->newptr == NULL)
3060 		return error;
3061 
3062 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3063 		return EINVAL;
3064 
3065 	HN_LOCK(sc);
3066 	hn_set_chim_size(sc, chim_size);
3067 	HN_UNLOCK(sc);
3068 	return 0;
3069 }
3070 
3071 #if __FreeBSD_version < 1100095
3072 static int
3073 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3074 {
3075 	struct hn_softc *sc = arg1;
3076 	int ofs = arg2, i, error;
3077 	struct hn_rx_ring *rxr;
3078 	uint64_t stat;
3079 
3080 	stat = 0;
3081 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3082 		rxr = &sc->hn_rx_ring[i];
3083 		stat += *((int *)((uint8_t *)rxr + ofs));
3084 	}
3085 
3086 	error = sysctl_handle_64(oidp, &stat, 0, req);
3087 	if (error || req->newptr == NULL)
3088 		return error;
3089 
3090 	/* Zero out this stat. */
3091 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3092 		rxr = &sc->hn_rx_ring[i];
3093 		*((int *)((uint8_t *)rxr + ofs)) = 0;
3094 	}
3095 	return 0;
3096 }
3097 #else
3098 static int
3099 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3100 {
3101 	struct hn_softc *sc = arg1;
3102 	int ofs = arg2, i, error;
3103 	struct hn_rx_ring *rxr;
3104 	uint64_t stat;
3105 
3106 	stat = 0;
3107 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3108 		rxr = &sc->hn_rx_ring[i];
3109 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3110 	}
3111 
3112 	error = sysctl_handle_64(oidp, &stat, 0, req);
3113 	if (error || req->newptr == NULL)
3114 		return error;
3115 
3116 	/* Zero out this stat. */
3117 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3118 		rxr = &sc->hn_rx_ring[i];
3119 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3120 	}
3121 	return 0;
3122 }
3123 
3124 #endif
3125 
3126 static int
3127 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3128 {
3129 	struct hn_softc *sc = arg1;
3130 	int ofs = arg2, i, error;
3131 	struct hn_rx_ring *rxr;
3132 	u_long stat;
3133 
3134 	stat = 0;
3135 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3136 		rxr = &sc->hn_rx_ring[i];
3137 		stat += *((u_long *)((uint8_t *)rxr + ofs));
3138 	}
3139 
3140 	error = sysctl_handle_long(oidp, &stat, 0, req);
3141 	if (error || req->newptr == NULL)
3142 		return error;
3143 
3144 	/* Zero out this stat. */
3145 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3146 		rxr = &sc->hn_rx_ring[i];
3147 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
3148 	}
3149 	return 0;
3150 }
3151 
3152 static int
3153 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3154 {
3155 	struct hn_softc *sc = arg1;
3156 	int ofs = arg2, i, error;
3157 	struct hn_tx_ring *txr;
3158 	u_long stat;
3159 
3160 	stat = 0;
3161 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3162 		txr = &sc->hn_tx_ring[i];
3163 		stat += *((u_long *)((uint8_t *)txr + ofs));
3164 	}
3165 
3166 	error = sysctl_handle_long(oidp, &stat, 0, req);
3167 	if (error || req->newptr == NULL)
3168 		return error;
3169 
3170 	/* Zero out this stat. */
3171 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3172 		txr = &sc->hn_tx_ring[i];
3173 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
3174 	}
3175 	return 0;
3176 }
3177 
3178 static int
3179 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3180 {
3181 	struct hn_softc *sc = arg1;
3182 	int ofs = arg2, i, error, conf;
3183 	struct hn_tx_ring *txr;
3184 
3185 	txr = &sc->hn_tx_ring[0];
3186 	conf = *((int *)((uint8_t *)txr + ofs));
3187 
3188 	error = sysctl_handle_int(oidp, &conf, 0, req);
3189 	if (error || req->newptr == NULL)
3190 		return error;
3191 
3192 	HN_LOCK(sc);
3193 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3194 		txr = &sc->hn_tx_ring[i];
3195 		*((int *)((uint8_t *)txr + ofs)) = conf;
3196 	}
3197 	HN_UNLOCK(sc);
3198 
3199 	return 0;
3200 }
3201 
3202 static int
3203 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3204 {
3205 	struct hn_softc *sc = arg1;
3206 	int error, size;
3207 
3208 	size = sc->hn_agg_size;
3209 	error = sysctl_handle_int(oidp, &size, 0, req);
3210 	if (error || req->newptr == NULL)
3211 		return (error);
3212 
3213 	HN_LOCK(sc);
3214 	sc->hn_agg_size = size;
3215 	hn_set_txagg(sc);
3216 	HN_UNLOCK(sc);
3217 
3218 	return (0);
3219 }
3220 
3221 static int
3222 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3223 {
3224 	struct hn_softc *sc = arg1;
3225 	int error, pkts;
3226 
3227 	pkts = sc->hn_agg_pkts;
3228 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3229 	if (error || req->newptr == NULL)
3230 		return (error);
3231 
3232 	HN_LOCK(sc);
3233 	sc->hn_agg_pkts = pkts;
3234 	hn_set_txagg(sc);
3235 	HN_UNLOCK(sc);
3236 
3237 	return (0);
3238 }
3239 
3240 static int
3241 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3242 {
3243 	struct hn_softc *sc = arg1;
3244 	int pkts;
3245 
3246 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3247 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3248 }
3249 
3250 static int
3251 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3252 {
3253 	struct hn_softc *sc = arg1;
3254 	int align;
3255 
3256 	align = sc->hn_tx_ring[0].hn_agg_align;
3257 	return (sysctl_handle_int(oidp, &align, 0, req));
3258 }
3259 
3260 static void
3261 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3262 {
3263 	if (pollhz == 0)
3264 		vmbus_chan_poll_disable(chan);
3265 	else
3266 		vmbus_chan_poll_enable(chan, pollhz);
3267 }
3268 
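/*
 * Enable (pollhz > 0) or disable (pollhz == 0) channel polling on the
 * primary channel and all sub-channels.
 */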
3269 static void
3270 hn_polling(struct hn_softc *sc, u_int pollhz)
3271 {
3272 	int nsubch = sc->hn_rx_ring_inuse - 1;
3273 
3274 	HN_LOCK_ASSERT(sc);
3275 
3276 	if (nsubch > 0) {
3277 		struct vmbus_channel **subch;
3278 		int i;
3279 
3280 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3281 		for (i = 0; i < nsubch; ++i)
3282 			hn_chan_polling(subch[i], pollhz);
3283 		vmbus_subchan_rel(subch, nsubch);
3284 	}
3285 	hn_chan_polling(sc->hn_prichan, pollhz);
3286 }
3287 
3288 static int
3289 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3290 {
3291 	struct hn_softc *sc = arg1;
3292 	int pollhz, error;
3293 
3294 	pollhz = sc->hn_pollhz;
3295 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3296 	if (error || req->newptr == NULL)
3297 		return (error);
3298 
3299 	if (pollhz != 0 &&
3300 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3301 		return (EINVAL);
3302 
3303 	HN_LOCK(sc);
3304 	if (sc->hn_pollhz != pollhz) {
3305 		sc->hn_pollhz = pollhz;
3306 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3307 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3308 			hn_polling(sc, sc->hn_pollhz);
3309 	}
3310 	HN_UNLOCK(sc);
3311 
3312 	return (0);
3313 }
3314 
3315 static int
3316 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3317 {
3318 	struct hn_softc *sc = arg1;
3319 	char verstr[16];
3320 
3321 	snprintf(verstr, sizeof(verstr), "%u.%u",
3322 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3323 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3324 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3325 }
3326 
3327 static int
3328 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3329 {
3330 	struct hn_softc *sc = arg1;
3331 	char caps_str[128];
3332 	uint32_t caps;
3333 
3334 	HN_LOCK(sc);
3335 	caps = sc->hn_caps;
3336 	HN_UNLOCK(sc);
3337 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3338 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3339 }
3340 
3341 static int
3342 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3343 {
3344 	struct hn_softc *sc = arg1;
3345 	char assist_str[128];
3346 	uint32_t hwassist;
3347 
3348 	HN_LOCK(sc);
3349 	hwassist = sc->hn_ifp->if_hwassist;
3350 	HN_UNLOCK(sc);
3351 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3352 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3353 }
3354 
3355 static int
3356 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3357 {
3358 	struct hn_softc *sc = arg1;
3359 	char filter_str[128];
3360 	uint32_t filter;
3361 
3362 	HN_LOCK(sc);
3363 	filter = sc->hn_rx_filter;
3364 	HN_UNLOCK(sc);
3365 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3366 	    NDIS_PACKET_TYPES);
3367 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3368 }
3369 
3370 #ifndef RSS
3371 
3372 static int
3373 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3374 {
3375 	struct hn_softc *sc = arg1;
3376 	int error;
3377 
3378 	HN_LOCK(sc);
3379 
3380 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3381 	if (error || req->newptr == NULL)
3382 		goto back;
3383 
3384 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3385 	if (error)
3386 		goto back;
3387 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3388 
3389 	if (sc->hn_rx_ring_inuse > 1) {
3390 		error = hn_rss_reconfig(sc);
3391 	} else {
3392 		/* Not RSS capable, at least for now; just save the RSS key. */
3393 		error = 0;
3394 	}
3395 back:
3396 	HN_UNLOCK(sc);
3397 	return (error);
3398 }
3399 
3400 static int
3401 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3402 {
3403 	struct hn_softc *sc = arg1;
3404 	int error;
3405 
3406 	HN_LOCK(sc);
3407 
3408 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3409 	if (error || req->newptr == NULL)
3410 		goto back;
3411 
3412 	/*
3413 	 * Don't allow the RSS indirect table to be changed, if this
3414 	 * interface is not currently RSS capable.
3415 	 */
3416 	if (sc->hn_rx_ring_inuse == 1) {
3417 		error = EOPNOTSUPP;
3418 		goto back;
3419 	}
3420 
3421 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3422 	if (error)
3423 		goto back;
3424 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3425 
3426 	hn_rss_ind_fixup(sc);
3427 	error = hn_rss_reconfig(sc);
3428 back:
3429 	HN_UNLOCK(sc);
3430 	return (error);
3431 }
3432 
3433 #endif	/* !RSS */
3434 
3435 static int
3436 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3437 {
3438 	struct hn_softc *sc = arg1;
3439 	char hash_str[128];
3440 	uint32_t hash;
3441 
3442 	HN_LOCK(sc);
3443 	hash = sc->hn_rss_hash;
3444 	HN_UNLOCK(sc);
3445 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3446 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3447 }
3448 
3449 static int
3450 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3451 {
3452 	struct hn_softc *sc = arg1;
3453 	char vf_name[IFNAMSIZ + 1];
3454 	struct ifnet *vf_ifp;
3455 
3456 	HN_LOCK(sc);
3457 	vf_name[0] = '\0';
3458 	vf_ifp = sc->hn_vf_ifp;
3459 	if (vf_ifp != NULL)
3460 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3461 	HN_UNLOCK(sc);
3462 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3463 }
3464 
3465 static int
3466 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
3467 {
3468 	struct hn_softc *sc = arg1;
3469 	char vf_name[IFNAMSIZ + 1];
3470 	struct ifnet *vf_ifp;
3471 
3472 	HN_LOCK(sc);
3473 	vf_name[0] = '\0';
3474 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
3475 	if (vf_ifp != NULL)
3476 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3477 	HN_UNLOCK(sc);
3478 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3479 }
3480 
3481 static int
3482 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
3483 {
3484 	struct rm_priotracker pt;
3485 	struct sbuf *sb;
3486 	int error, i;
3487 	bool first;
3488 
3489 	error = sysctl_wire_old_buffer(req, 0);
3490 	if (error != 0)
3491 		return (error);
3492 
3493 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3494 	if (sb == NULL)
3495 		return (ENOMEM);
3496 
3497 	rm_rlock(&hn_vfmap_lock, &pt);
3498 
3499 	first = true;
3500 	for (i = 0; i < hn_vfmap_size; ++i) {
3501 		struct ifnet *ifp;
3502 
3503 		if (hn_vfmap[i] == NULL)
3504 			continue;
3505 
3506 		ifp = ifnet_byindex(i);
3507 		if (ifp != NULL) {
3508 			if (first)
3509 				sbuf_printf(sb, "%s", ifp->if_xname);
3510 			else
3511 				sbuf_printf(sb, " %s", ifp->if_xname);
3512 			first = false;
3513 		}
3514 	}
3515 
3516 	rm_runlock(&hn_vfmap_lock, &pt);
3517 
3518 	error = sbuf_finish(sb);
3519 	sbuf_delete(sb);
3520 	return (error);
3521 }
3522 
3523 static int
3524 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
3525 {
3526 	struct rm_priotracker pt;
3527 	struct sbuf *sb;
3528 	int error, i;
3529 	bool first;
3530 
3531 	error = sysctl_wire_old_buffer(req, 0);
3532 	if (error != 0)
3533 		return (error);
3534 
3535 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3536 	if (sb == NULL)
3537 		return (ENOMEM);
3538 
3539 	rm_rlock(&hn_vfmap_lock, &pt);
3540 
3541 	first = true;
3542 	for (i = 0; i < hn_vfmap_size; ++i) {
3543 		struct ifnet *ifp, *hn_ifp;
3544 
3545 		hn_ifp = hn_vfmap[i];
3546 		if (hn_ifp == NULL)
3547 			continue;
3548 
3549 		ifp = ifnet_byindex(i);
3550 		if (ifp != NULL) {
3551 			if (first) {
3552 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
3553 				    hn_ifp->if_xname);
3554 			} else {
3555 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
3556 				    hn_ifp->if_xname);
3557 			}
3558 			first = false;
3559 		}
3560 	}
3561 
3562 	rm_runlock(&hn_vfmap_lock, &pt);
3563 
3564 	error = sbuf_finish(sb);
3565 	sbuf_delete(sb);
3566 	return (error);
3567 }
3568 
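/*
 * Verify that the IP header, and for TCP/UDP the transport header as
 * well, resides completely in the first mbuf and that the packet is not
 * a fragment.  Returns the IP protocol on success, IPPROTO_DONE
 * otherwise.
 */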
3569 static int
3570 hn_check_iplen(const struct mbuf *m, int hoff)
3571 {
3572 	const struct ip *ip;
3573 	int len, iphlen, iplen;
3574 	const struct tcphdr *th;
3575 	int thoff;				/* TCP data offset */
3576 
3577 	len = hoff + sizeof(struct ip);
3578 
3579 	/* The packet must be at least the size of an IP header. */
3580 	if (m->m_pkthdr.len < len)
3581 		return IPPROTO_DONE;
3582 
3583 	/* The fixed IP header must reside completely in the first mbuf. */
3584 	if (m->m_len < len)
3585 		return IPPROTO_DONE;
3586 
3587 	ip = mtodo(m, hoff);
3588 
3589 	/* Bound check the packet's stated IP header length. */
3590 	iphlen = ip->ip_hl << 2;
3591 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3592 		return IPPROTO_DONE;
3593 
3594 	/* The full IP header must reside completely in the one mbuf. */
3595 	if (m->m_len < hoff + iphlen)
3596 		return IPPROTO_DONE;
3597 
3598 	iplen = ntohs(ip->ip_len);
3599 
3600 	/*
3601 	 * Check that the amount of data in the buffers is as
3602 	 * Check that the amount of data in the buffers is at
3603 	 * least as much as the IP header would have us expect.
3604 	if (m->m_pkthdr.len < hoff + iplen)
3605 		return IPPROTO_DONE;
3606 
3607 	/*
3608 	 * Ignore IP fragments.
3609 	 */
3610 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3611 		return IPPROTO_DONE;
3612 
3613 	/*
3614 	 * The TCP/IP or UDP/IP header must be entirely contained within
3615 	 * the first fragment of a packet.
3616 	 */
3617 	switch (ip->ip_p) {
3618 	case IPPROTO_TCP:
3619 		if (iplen < iphlen + sizeof(struct tcphdr))
3620 			return IPPROTO_DONE;
3621 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3622 			return IPPROTO_DONE;
3623 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3624 		thoff = th->th_off << 2;
3625 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3626 			return IPPROTO_DONE;
3627 		if (m->m_len < hoff + iphlen + thoff)
3628 			return IPPROTO_DONE;
3629 		break;
3630 	case IPPROTO_UDP:
3631 		if (iplen < iphlen + sizeof(struct udphdr))
3632 			return IPPROTO_DONE;
3633 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3634 			return IPPROTO_DONE;
3635 		break;
3636 	default:
3637 		if (iplen < iphlen)
3638 			return IPPROTO_DONE;
3639 		break;
3640 	}
3641 	return ip->ip_p;
3642 }
3643 
3644 static int
3645 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3646 {
3647 	struct sysctl_oid_list *child;
3648 	struct sysctl_ctx_list *ctx;
3649 	device_t dev = sc->hn_dev;
3650 #if defined(INET) || defined(INET6)
3651 #if __FreeBSD_version >= 1100095
3652 	int lroent_cnt;
3653 #endif
3654 #endif
3655 	int i;
3656 
3657 	/*
3658 	 * Create RXBUF for reception.
3659 	 *
3660 	 * NOTE:
3661 	 * - It is shared by all channels.
3662 	 * - A large enough buffer is allocated; certain versions of the NVS
3663 	 *   may further limit the usable space.
3664 	 */
3665 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3666 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3667 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3668 	if (sc->hn_rxbuf == NULL) {
3669 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3670 		return (ENOMEM);
3671 	}
3672 
3673 	sc->hn_rx_ring_cnt = ring_cnt;
3674 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3675 
3676 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3677 	    M_DEVBUF, M_WAITOK | M_ZERO);
3678 
3679 #if defined(INET) || defined(INET6)
3680 #if __FreeBSD_version >= 1100095
3681 	lroent_cnt = hn_lro_entry_count;
3682 	if (lroent_cnt < TCP_LRO_ENTRIES)
3683 		lroent_cnt = TCP_LRO_ENTRIES;
3684 	if (bootverbose)
3685 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3686 #endif
3687 #endif	/* INET || INET6 */
3688 
3689 	ctx = device_get_sysctl_ctx(dev);
3690 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3691 
3692 	/* Create dev.hn.UNIT.rx sysctl tree */
3693 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3694 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3695 
3696 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3697 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3698 
3699 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3700 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3701 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3702 		if (rxr->hn_br == NULL) {
3703 			device_printf(dev, "allocate bufring failed\n");
3704 			return (ENOMEM);
3705 		}
3706 
3707 		if (hn_trust_hosttcp)
3708 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3709 		if (hn_trust_hostudp)
3710 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3711 		if (hn_trust_hostip)
3712 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3713 		rxr->hn_ifp = sc->hn_ifp;
3714 		if (i < sc->hn_tx_ring_cnt)
3715 			rxr->hn_txr = &sc->hn_tx_ring[i];
3716 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3717 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3718 		rxr->hn_rx_idx = i;
3719 		rxr->hn_rxbuf = sc->hn_rxbuf;
3720 
3721 		/*
3722 		 * Initialize LRO.
3723 		 */
3724 #if defined(INET) || defined(INET6)
3725 #if __FreeBSD_version >= 1100095
3726 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3727 		    hn_lro_mbufq_depth);
3728 #else
3729 		tcp_lro_init(&rxr->hn_lro);
3730 		rxr->hn_lro.ifp = sc->hn_ifp;
3731 #endif
3732 #if __FreeBSD_version >= 1100099
3733 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3734 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3735 #endif
3736 #endif	/* INET || INET6 */
3737 
3738 		if (sc->hn_rx_sysctl_tree != NULL) {
3739 			char name[16];
3740 
3741 			/*
3742 			 * Create per RX ring sysctl tree:
3743 			 * dev.hn.UNIT.rx.RINGID
3744 			 */
3745 			snprintf(name, sizeof(name), "%d", i);
3746 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3747 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3748 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3749 
3750 			if (rxr->hn_rx_sysctl_tree != NULL) {
3751 				SYSCTL_ADD_ULONG(ctx,
3752 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3753 				    OID_AUTO, "packets", CTLFLAG_RW,
3754 				    &rxr->hn_pkts, "# of packets received");
3755 				SYSCTL_ADD_ULONG(ctx,
3756 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3757 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3758 				    &rxr->hn_rss_pkts,
3759 				    "# of packets w/ RSS info received");
3760 				SYSCTL_ADD_INT(ctx,
3761 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3762 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3763 				    &rxr->hn_pktbuf_len, 0,
3764 				    "Temporary channel packet buffer length");
3765 			}
3766 		}
3767 	}
3768 
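	/*
	 * Device-level RX statistics.  arg2 of each node below is the
	 * offset of the per-ring counter within struct hn_rx_ring; the
	 * handlers are expected to apply it across all RX rings.
	 */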
3769 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3770 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3771 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3772 #if __FreeBSD_version < 1100095
3773 	    hn_rx_stat_int_sysctl,
3774 #else
3775 	    hn_rx_stat_u64_sysctl,
3776 #endif
3777 	    "LU", "LRO queued");
3778 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3779 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3780 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3781 #if __FreeBSD_version < 1100095
3782 	    hn_rx_stat_int_sysctl,
3783 #else
3784 	    hn_rx_stat_u64_sysctl,
3785 #endif
3786 	    "LU", "LRO flushed");
3787 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3788 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3789 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3790 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3791 #if __FreeBSD_version >= 1100099
3792 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3793 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3794 	    hn_lro_lenlim_sysctl, "IU",
3795 	    "Max # of data bytes to be aggregated by LRO");
3796 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3797 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3798 	    hn_lro_ackcnt_sysctl, "I",
3799 	    "Max # of ACKs to be aggregated by LRO");
3800 #endif
3801 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3802 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3803 	    hn_trust_hcsum_sysctl, "I",
3804 	    "Trust tcp segment verification on host side, "
3805 	    "when csum info is missing");
3806 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3807 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3808 	    hn_trust_hcsum_sysctl, "I",
3809 	    "Trust udp datagram verification on host side, "
3810 	    "when csum info is missing");
3811 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3812 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3813 	    hn_trust_hcsum_sysctl, "I",
3814 	    "Trust ip packet verification on host side, "
3815 	    "when csum info is missing");
3816 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3817 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3818 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3819 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3820 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3821 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3822 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3823 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3824 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3825 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3826 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3827 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3828 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3829 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3830 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3831 	    hn_rx_stat_ulong_sysctl, "LU",
3832 	    "# of packets that we trust host's csum verification");
3833 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3834 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3835 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3836 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3837 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3838 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3839 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3840 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3841 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3842 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3843 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3844 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3845 
3846 	return (0);
3847 }
3848 
3849 static void
3850 hn_destroy_rx_data(struct hn_softc *sc)
3851 {
3852 	int i;
3853 
3854 	if (sc->hn_rxbuf != NULL) {
3855 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3856 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3857 		else
3858 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3859 		sc->hn_rxbuf = NULL;
3860 	}
3861 
3862 	if (sc->hn_rx_ring_cnt == 0)
3863 		return;
3864 
3865 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3866 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3867 
3868 		if (rxr->hn_br == NULL)
3869 			continue;
3870 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3871 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3872 		} else {
3873 			device_printf(sc->hn_dev,
3874 			    "%dth channel bufring is referenced\n", i);
3875 		}
3876 		rxr->hn_br = NULL;
3877 
3878 #if defined(INET) || defined(INET6)
3879 		tcp_lro_free(&rxr->hn_lro);
3880 #endif
3881 		free(rxr->hn_pktbuf, M_DEVBUF);
3882 	}
3883 	free(sc->hn_rx_ring, M_DEVBUF);
3884 	sc->hn_rx_ring = NULL;
3885 
3886 	sc->hn_rx_ring_cnt = 0;
3887 	sc->hn_rx_ring_inuse = 0;
3888 }
3889 
3890 static int
3891 hn_tx_ring_create(struct hn_softc *sc, int id)
3892 {
3893 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3894 	device_t dev = sc->hn_dev;
3895 	bus_dma_tag_t parent_dtag;
3896 	int error, i;
3897 
3898 	txr->hn_sc = sc;
3899 	txr->hn_tx_idx = id;
3900 
3901 #ifndef HN_USE_TXDESC_BUFRING
3902 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3903 #endif
3904 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3905 
3906 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3907 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3908 	    M_DEVBUF, M_WAITOK | M_ZERO);
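	/*
	 * Free txdescs are kept either on a spinlock-protected SLIST or
	 * on a lock-free buf_ring, depending on HN_USE_TXDESC_BUFRING.
	 */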
3909 #ifndef HN_USE_TXDESC_BUFRING
3910 	SLIST_INIT(&txr->hn_txlist);
3911 #else
3912 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3913 	    M_WAITOK, &txr->hn_tx_lock);
3914 #endif
3915 
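	/*
	 * Bind this TX ring to a taskqueue: either the VMBus event
	 * taskqueue of the CPU this ring maps to, or one of the driver's
	 * own TX taskqueues picked in round-robin, depending on
	 * hn_tx_taskq_mode.
	 */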
3916 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3917 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3918 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3919 	} else {
3920 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3921 	}
3922 
3923 #ifdef HN_IFSTART_SUPPORT
3924 	if (hn_use_if_start) {
3925 		txr->hn_txeof = hn_start_txeof;
3926 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3927 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3928 	} else
3929 #endif
3930 	{
3931 		int br_depth;
3932 
3933 		txr->hn_txeof = hn_xmit_txeof;
3934 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3935 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3936 
3937 		br_depth = hn_get_txswq_depth(txr);
3938 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3939 		    M_WAITOK, &txr->hn_tx_lock);
3940 	}
3941 
3942 	txr->hn_direct_tx_size = hn_direct_tx_size;
3943 
3944 	/*
3945 	 * Always schedule transmission instead of trying to do direct
3946 	 * transmission.  This one gives the best performance so far.
3947 	 */
3948 	txr->hn_sched_tx = 1;
3949 
3950 	parent_dtag = bus_get_dma_tag(dev);
3951 
3952 	/* DMA tag for RNDIS packet messages. */
3953 	error = bus_dma_tag_create(parent_dtag, /* parent */
3954 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3955 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3956 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3957 	    BUS_SPACE_MAXADDR,		/* highaddr */
3958 	    NULL, NULL,			/* filter, filterarg */
3959 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3960 	    1,				/* nsegments */
3961 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3962 	    0,				/* flags */
3963 	    NULL,			/* lockfunc */
3964 	    NULL,			/* lockfuncarg */
3965 	    &txr->hn_tx_rndis_dtag);
3966 	if (error) {
3967 		device_printf(dev, "failed to create rndis dmatag\n");
3968 		return error;
3969 	}
3970 
3971 	/* DMA tag for data. */
3972 	error = bus_dma_tag_create(parent_dtag, /* parent */
3973 	    1,				/* alignment */
3974 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3975 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3976 	    BUS_SPACE_MAXADDR,		/* highaddr */
3977 	    NULL, NULL,			/* filter, filterarg */
3978 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3979 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3980 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3981 	    0,				/* flags */
3982 	    NULL,			/* lockfunc */
3983 	    NULL,			/* lockfuncarg */
3984 	    &txr->hn_tx_data_dtag);
3985 	if (error) {
3986 		device_printf(dev, "failed to create data dmatag\n");
3987 		return error;
3988 	}
3989 
3990 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3991 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3992 
3993 		txd->txr = txr;
3994 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3995 		STAILQ_INIT(&txd->agg_list);
3996 
3997 		/*
3998 		 * Allocate and load RNDIS packet message.
3999 		 */
4000 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4001 		    (void **)&txd->rndis_pkt,
4002 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4003 		    &txd->rndis_pkt_dmap);
4004 		if (error) {
4005 			device_printf(dev,
4006 			    "failed to allocate rndis_packet_msg, %d\n", i);
4007 			return error;
4008 		}
4009 
4010 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4011 		    txd->rndis_pkt_dmap,
4012 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4013 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4014 		    BUS_DMA_NOWAIT);
4015 		if (error) {
4016 			device_printf(dev,
4017 			    "failed to load rndis_packet_msg, %d\n", i);
4018 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4019 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4020 			return error;
4021 		}
4022 
4023 		/* DMA map for TX data. */
4024 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4025 		    &txd->data_dmap);
4026 		if (error) {
4027 			device_printf(dev,
4028 			    "failed to allocate tx data dmamap\n");
4029 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4030 			    txd->rndis_pkt_dmap);
4031 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4032 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4033 			return error;
4034 		}
4035 
4036 		/* All set, put it on the list */
4037 		txd->flags |= HN_TXD_FLAG_ONLIST;
4038 #ifndef HN_USE_TXDESC_BUFRING
4039 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4040 #else
4041 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
4042 #endif
4043 	}
4044 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4045 
4046 	if (sc->hn_tx_sysctl_tree != NULL) {
4047 		struct sysctl_oid_list *child;
4048 		struct sysctl_ctx_list *ctx;
4049 		char name[16];
4050 
4051 		/*
4052 		 * Create per TX ring sysctl tree:
4053 		 * dev.hn.UNIT.tx.RINGID
4054 		 */
4055 		ctx = device_get_sysctl_ctx(dev);
4056 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4057 
4058 		snprintf(name, sizeof(name), "%d", id);
4059 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4060 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4061 
4062 		if (txr->hn_tx_sysctl_tree != NULL) {
4063 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4064 
4065 #ifdef HN_DEBUG
4066 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4067 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4068 			    "# of available TX descs");
4069 #endif
4070 #ifdef HN_IFSTART_SUPPORT
4071 			if (!hn_use_if_start)
4072 #endif
4073 			{
4074 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4075 				    CTLFLAG_RD, &txr->hn_oactive, 0,
4076 				    "over active");
4077 			}
4078 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4079 			    CTLFLAG_RW, &txr->hn_pkts,
4080 			    "# of packets transmitted");
4081 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4082 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
4083 		}
4084 	}
4085 
4086 	return 0;
4087 }
4088 
4089 static void
4090 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4091 {
4092 	struct hn_tx_ring *txr = txd->txr;
4093 
4094 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
4095 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4096 
4097 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4098 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4099 	    txd->rndis_pkt_dmap);
4100 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4101 }
4102 
4103 static void
4104 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4105 {
4106 
4107 	KASSERT(txd->refs == 0 || txd->refs == 1,
4108 	    ("invalid txd refs %d", txd->refs));
4109 
4110 	/* Aggregated txds will be freed by their aggregating txd. */
4111 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4112 		int freed;
4113 
4114 		freed = hn_txdesc_put(txr, txd);
4115 		KASSERT(freed, ("can't free txdesc"));
4116 	}
4117 }
4118 
4119 static void
4120 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4121 {
4122 	int i;
4123 
4124 	if (txr->hn_txdesc == NULL)
4125 		return;
4126 
4127 	/*
4128 	 * NOTE:
4129 	 * Because the freeing of aggregated txds will be deferred
4130 	 * to the aggregating txd, two passes are used here:
4131 	 * - The first pass GCes any pending txds.  This GC is necessary,
4132 	 *   since if the channels are revoked, the hypervisor will not
4133 	 *   deliver send-done for all pending txds.
4134 	 * - The second pass frees the busdma resources, i.e. after all
4135 	 *   txds have been freed.
4136 	 */
4137 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4138 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4139 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4140 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4141 
4142 	if (txr->hn_tx_data_dtag != NULL)
4143 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4144 	if (txr->hn_tx_rndis_dtag != NULL)
4145 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4146 
4147 #ifdef HN_USE_TXDESC_BUFRING
4148 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4149 #endif
4150 
4151 	free(txr->hn_txdesc, M_DEVBUF);
4152 	txr->hn_txdesc = NULL;
4153 
4154 	if (txr->hn_mbuf_br != NULL)
4155 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4156 
4157 #ifndef HN_USE_TXDESC_BUFRING
4158 	mtx_destroy(&txr->hn_txlist_spin);
4159 #endif
4160 	mtx_destroy(&txr->hn_tx_lock);
4161 }
4162 
4163 static int
4164 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4165 {
4166 	struct sysctl_oid_list *child;
4167 	struct sysctl_ctx_list *ctx;
4168 	int i;
4169 
4170 	/*
4171 	 * Create TXBUF for chimney sending.
4172 	 *
4173 	 * NOTE: It is shared by all channels.
4174 	 */
4175 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4176 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4177 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4178 	if (sc->hn_chim == NULL) {
4179 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
4180 		return (ENOMEM);
4181 	}
4182 
4183 	sc->hn_tx_ring_cnt = ring_cnt;
4184 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4185 
4186 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4187 	    M_DEVBUF, M_WAITOK | M_ZERO);
4188 
4189 	ctx = device_get_sysctl_ctx(sc->hn_dev);
4190 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4191 
4192 	/* Create dev.hn.UNIT.tx sysctl tree */
4193 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4194 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4195 
4196 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4197 		int error;
4198 
4199 		error = hn_tx_ring_create(sc, i);
4200 		if (error)
4201 			return error;
4202 	}
4203 
4204 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4205 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4206 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
4207 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4208 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4209 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4210 	    __offsetof(struct hn_tx_ring, hn_send_failed),
4211 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
4212 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4213 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4214 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
4215 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
4216 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4217 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4218 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
4219 	    hn_tx_stat_ulong_sysctl, "LU",
4220 	    "# of packet transmission aggregation flush failures");
4221 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4222 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4223 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4224 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
4225 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4226 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4227 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
4228 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
4229 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4230 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4231 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4232 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4233 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4234 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4235 	    "# of total TX descs");
4236 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4237 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4238 	    "Chimney send packet size upper boundary");
4239 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4240 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4241 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4242 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
4243 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4244 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
4245 	    hn_tx_conf_int_sysctl, "I",
4246 	    "Size of the packet for direct transmission");
4247 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
4248 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4249 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
4250 	    hn_tx_conf_int_sysctl, "I",
4251 	    "Always schedule transmission "
4252 	    "instead of doing direct transmission");
4253 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4254 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4255 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4256 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4257 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4258 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4259 	    "Applied packet transmission aggregation size");
4260 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4261 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4262 	    hn_txagg_pktmax_sysctl, "I",
4263 	    "Applied packet transmission aggregation packets");
4264 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4265 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4266 	    hn_txagg_align_sysctl, "I",
4267 	    "Applied packet transmission aggregation alignment");
4268 
4269 	return 0;
4270 }
4271 
4272 static void
4273 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4274 {
4275 	int i;
4276 
4277 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4278 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
4279 }
4280 
4281 static void
4282 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4283 {
4284 	struct ifnet *ifp = sc->hn_ifp;
4285 	int tso_minlen;
4286 
4287 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4288 		return;
4289 
4290 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4291 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4292 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4293 
4294 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4295 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4296 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4297 
4298 	if (tso_maxlen < tso_minlen)
4299 		tso_maxlen = tso_minlen;
4300 	else if (tso_maxlen > IP_MAXPACKET)
4301 		tso_maxlen = IP_MAXPACKET;
4302 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
4303 		tso_maxlen = sc->hn_ndis_tso_szmax;
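	/*
	 * Leave room for the Ethernet and VLAN headers in the TSO size
	 * limit advertised through if_hw_tsomax.
	 */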
4304 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4305 	if (bootverbose)
4306 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
4307 }
4308 
4309 static void
4310 hn_fixup_tx_data(struct hn_softc *sc)
4311 {
4312 	uint64_t csum_assist;
4313 	int i;
4314 
4315 	hn_set_chim_size(sc, sc->hn_chim_szmax);
4316 	if (hn_tx_chimney_size > 0 &&
4317 	    hn_tx_chimney_size < sc->hn_chim_szmax)
4318 		hn_set_chim_size(sc, hn_tx_chimney_size);
4319 
4320 	csum_assist = 0;
4321 	if (sc->hn_caps & HN_CAP_IPCS)
4322 		csum_assist |= CSUM_IP;
4323 	if (sc->hn_caps & HN_CAP_TCP4CS)
4324 		csum_assist |= CSUM_IP_TCP;
4325 	if (sc->hn_caps & HN_CAP_UDP4CS)
4326 		csum_assist |= CSUM_IP_UDP;
4327 	if (sc->hn_caps & HN_CAP_TCP6CS)
4328 		csum_assist |= CSUM_IP6_TCP;
4329 	if (sc->hn_caps & HN_CAP_UDP6CS)
4330 		csum_assist |= CSUM_IP6_UDP;
4331 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4332 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4333 
4334 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4335 		/*
4336 		 * Support HASHVAL pktinfo on TX path.
4337 		 */
4338 		if (bootverbose)
4339 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4340 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4341 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4342 	}
4343 }
4344 
4345 static void
4346 hn_destroy_tx_data(struct hn_softc *sc)
4347 {
4348 	int i;
4349 
4350 	if (sc->hn_chim != NULL) {
4351 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4352 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4353 		} else {
4354 			device_printf(sc->hn_dev,
4355 			    "chimney sending buffer is referenced\n");
4356 		}
4357 		sc->hn_chim = NULL;
4358 	}
4359 
4360 	if (sc->hn_tx_ring_cnt == 0)
4361 		return;
4362 
4363 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4364 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4365 
4366 	free(sc->hn_tx_ring, M_DEVBUF);
4367 	sc->hn_tx_ring = NULL;
4368 
4369 	sc->hn_tx_ring_cnt = 0;
4370 	sc->hn_tx_ring_inuse = 0;
4371 }
4372 
4373 #ifdef HN_IFSTART_SUPPORT
4374 
4375 static void
4376 hn_start_taskfunc(void *xtxr, int pending __unused)
4377 {
4378 	struct hn_tx_ring *txr = xtxr;
4379 
4380 	mtx_lock(&txr->hn_tx_lock);
4381 	hn_start_locked(txr, 0);
4382 	mtx_unlock(&txr->hn_tx_lock);
4383 }
4384 
4385 static int
4386 hn_start_locked(struct hn_tx_ring *txr, int len)
4387 {
4388 	struct hn_softc *sc = txr->hn_sc;
4389 	struct ifnet *ifp = sc->hn_ifp;
4390 	int sched = 0;
4391 
4392 	KASSERT(hn_use_if_start,
4393 	    ("hn_start_locked is called when if_start is disabled"));
4394 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4395 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4396 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4397 
4398 	if (__predict_false(txr->hn_suspended))
4399 		return (0);
4400 
4401 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4402 	    IFF_DRV_RUNNING)
4403 		return (0);
4404 
4405 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4406 		struct hn_txdesc *txd;
4407 		struct mbuf *m_head;
4408 		int error;
4409 
4410 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4411 		if (m_head == NULL)
4412 			break;
4413 
4414 		if (len > 0 && m_head->m_pkthdr.len > len) {
4415 			/*
4416 			 * This sending could be time consuming; let callers
4417 			 * dispatch this packet sending (and sending of any
4418 			 * follow-up packets) to the tx taskqueue.
4419 			 */
4420 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4421 			sched = 1;
4422 			break;
4423 		}
4424 
4425 #if defined(INET6) || defined(INET)
4426 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4427 			m_head = hn_tso_fixup(m_head);
4428 			if (__predict_false(m_head == NULL)) {
4429 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4430 				continue;
4431 			}
4432 		}
4433 #endif
4434 
4435 		txd = hn_txdesc_get(txr);
4436 		if (txd == NULL) {
4437 			txr->hn_no_txdescs++;
4438 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4439 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4440 			break;
4441 		}
4442 
4443 		error = hn_encap(ifp, txr, txd, &m_head);
4444 		if (error) {
4445 			/* Both txd and m_head are freed */
4446 			KASSERT(txr->hn_agg_txd == NULL,
4447 			    ("encap failed w/ pending aggregating txdesc"));
4448 			continue;
4449 		}
4450 
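		/*
		 * No more packets can be aggregated: flush the pending
		 * aggregating txdesc if there is one, otherwise transmit
		 * this packet directly.
		 */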
4451 		if (txr->hn_agg_pktleft == 0) {
4452 			if (txr->hn_agg_txd != NULL) {
4453 				KASSERT(m_head == NULL,
4454 				    ("pending mbuf for aggregating txdesc"));
4455 				error = hn_flush_txagg(ifp, txr);
4456 				if (__predict_false(error)) {
4457 					atomic_set_int(&ifp->if_drv_flags,
4458 					    IFF_DRV_OACTIVE);
4459 					break;
4460 				}
4461 			} else {
4462 				KASSERT(m_head != NULL, ("mbuf was freed"));
4463 				error = hn_txpkt(ifp, txr, txd);
4464 				if (__predict_false(error)) {
4465 					/* txd is freed, but m_head is not */
4466 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4467 					atomic_set_int(&ifp->if_drv_flags,
4468 					    IFF_DRV_OACTIVE);
4469 					break;
4470 				}
4471 			}
4472 		}
4473 #ifdef INVARIANTS
4474 		else {
4475 			KASSERT(txr->hn_agg_txd != NULL,
4476 			    ("no aggregating txdesc"));
4477 			KASSERT(m_head == NULL,
4478 			    ("pending mbuf for aggregating txdesc"));
4479 		}
4480 #endif
4481 	}
4482 
4483 	/* Flush any pending aggregated transmission. */
4484 	if (txr->hn_agg_txd != NULL)
4485 		hn_flush_txagg(ifp, txr);
4486 	return (sched);
4487 }
4488 
4489 static void
4490 hn_start(struct ifnet *ifp)
4491 {
4492 	struct hn_softc *sc = ifp->if_softc;
4493 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4494 
4495 	if (txr->hn_sched_tx)
4496 		goto do_sched;
4497 
4498 	if (mtx_trylock(&txr->hn_tx_lock)) {
4499 		int sched;
4500 
4501 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4502 		mtx_unlock(&txr->hn_tx_lock);
4503 		if (!sched)
4504 			return;
4505 	}
4506 do_sched:
4507 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4508 }
4509 
4510 static void
4511 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4512 {
4513 	struct hn_tx_ring *txr = xtxr;
4514 
4515 	mtx_lock(&txr->hn_tx_lock);
4516 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4517 	hn_start_locked(txr, 0);
4518 	mtx_unlock(&txr->hn_tx_lock);
4519 }
4520 
4521 static void
4522 hn_start_txeof(struct hn_tx_ring *txr)
4523 {
4524 	struct hn_softc *sc = txr->hn_sc;
4525 	struct ifnet *ifp = sc->hn_ifp;
4526 
4527 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4528 
4529 	if (txr->hn_sched_tx)
4530 		goto do_sched;
4531 
4532 	if (mtx_trylock(&txr->hn_tx_lock)) {
4533 		int sched;
4534 
4535 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4536 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4537 		mtx_unlock(&txr->hn_tx_lock);
4538 		if (sched) {
4539 			taskqueue_enqueue(txr->hn_tx_taskq,
4540 			    &txr->hn_tx_task);
4541 		}
4542 	} else {
4543 do_sched:
4544 		/*
4545 		 * Release OACTIVE earlier, in the hope that others
4546 		 * can catch up.  The task will clear the flag again
4547 		 * while holding the hn_tx_lock to avoid possible
4548 		 * races.
4549 		 */
4550 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4551 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4552 	}
4553 }
4554 
4555 #endif	/* HN_IFSTART_SUPPORT */
4556 
4557 static int
4558 hn_xmit(struct hn_tx_ring *txr, int len)
4559 {
4560 	struct hn_softc *sc = txr->hn_sc;
4561 	struct ifnet *ifp = sc->hn_ifp;
4562 	struct mbuf *m_head;
4563 	int sched = 0;
4564 
4565 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4566 #ifdef HN_IFSTART_SUPPORT
4567 	KASSERT(hn_use_if_start == 0,
4568 	    ("hn_xmit is called when if_start is enabled"));
4569 #endif
4570 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4571 
4572 	if (__predict_false(txr->hn_suspended))
4573 		return (0);
4574 
4575 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4576 		return (0);
4577 
4578 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4579 		struct hn_txdesc *txd;
4580 		int error;
4581 
4582 		if (len > 0 && m_head->m_pkthdr.len > len) {
4583 			/*
4584 			 * This sending could be time consuming; let callers
4585 			 * dispatch this packet sending (and sending of any
4586 			 * follow-up packets) to the tx taskqueue.
4587 			 */
4588 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4589 			sched = 1;
4590 			break;
4591 		}
4592 
4593 		txd = hn_txdesc_get(txr);
4594 		if (txd == NULL) {
4595 			txr->hn_no_txdescs++;
4596 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4597 			txr->hn_oactive = 1;
4598 			break;
4599 		}
4600 
4601 		error = hn_encap(ifp, txr, txd, &m_head);
4602 		if (error) {
4603 			/* Both txd and m_head are freed; discard */
4604 			KASSERT(txr->hn_agg_txd == NULL,
4605 			    ("encap failed w/ pending aggregating txdesc"));
4606 			drbr_advance(ifp, txr->hn_mbuf_br);
4607 			continue;
4608 		}
4609 
4610 		if (txr->hn_agg_pktleft == 0) {
4611 			if (txr->hn_agg_txd != NULL) {
4612 				KASSERT(m_head == NULL,
4613 				    ("pending mbuf for aggregating txdesc"));
4614 				error = hn_flush_txagg(ifp, txr);
4615 				if (__predict_false(error)) {
4616 					txr->hn_oactive = 1;
4617 					break;
4618 				}
4619 			} else {
4620 				KASSERT(m_head != NULL, ("mbuf was freed"));
4621 				error = hn_txpkt(ifp, txr, txd);
4622 				if (__predict_false(error)) {
4623 					/* txd is freed, but m_head is not */
4624 					drbr_putback(ifp, txr->hn_mbuf_br,
4625 					    m_head);
4626 					txr->hn_oactive = 1;
4627 					break;
4628 				}
4629 			}
4630 		}
4631 #ifdef INVARIANTS
4632 		else {
4633 			KASSERT(txr->hn_agg_txd != NULL,
4634 			    ("no aggregating txdesc"));
4635 			KASSERT(m_head == NULL,
4636 			    ("pending mbuf for aggregating txdesc"));
4637 		}
4638 #endif
4639 
4640 		/* Sent */
4641 		drbr_advance(ifp, txr->hn_mbuf_br);
4642 	}
4643 
4644 	/* Flush any pending aggregated transmission. */
4645 	if (txr->hn_agg_txd != NULL)
4646 		hn_flush_txagg(ifp, txr);
4647 	return (sched);
4648 }
4649 
4650 static int
4651 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4652 {
4653 	struct hn_softc *sc = ifp->if_softc;
4654 	struct hn_tx_ring *txr;
4655 	int error, idx = 0;
4656 
4657 #if defined(INET6) || defined(INET)
4658 	/*
4659 	 * Perform TSO packet header fixup now, since the TSO
4660 	 * packet header should be cache-hot.
4661 	 */
4662 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4663 		m = hn_tso_fixup(m);
4664 		if (__predict_false(m == NULL)) {
4665 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4666 			return EIO;
4667 		}
4668 	}
4669 #endif
4670 
4671 	/*
4672 	 * Select the TX ring based on flowid
4673 	 */
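	/*
	 * With RSS the hash is mapped to an RSS bucket; otherwise the
	 * hash is used modulo the number of in-use TX rings, except that
	 * small non-TSO TCP SYN segments are steered to ring 0.
	 */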
4674 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4675 #ifdef RSS
4676 		uint32_t bid;
4677 
4678 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4679 		    &bid) == 0)
4680 			idx = bid % sc->hn_tx_ring_inuse;
4681 		else
4682 #endif
4683 		{
4684 #if defined(INET6) || defined(INET)
4685 			int tcpsyn = 0;
4686 
4687 			if (m->m_pkthdr.len < 128 &&
4688 			    (m->m_pkthdr.csum_flags &
4689 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
4690 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
4691 				m = hn_check_tcpsyn(m, &tcpsyn);
4692 				if (__predict_false(m == NULL)) {
4693 					if_inc_counter(ifp,
4694 					    IFCOUNTER_OERRORS, 1);
4695 					return (EIO);
4696 				}
4697 			}
4698 #else
4699 			const int tcpsyn = 0;
4700 #endif
4701 			if (tcpsyn)
4702 				idx = 0;
4703 			else
4704 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4705 		}
4706 	}
4707 	txr = &sc->hn_tx_ring[idx];
4708 
4709 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4710 	if (error) {
4711 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4712 		return error;
4713 	}
4714 
4715 	if (txr->hn_oactive)
4716 		return 0;
4717 
4718 	if (txr->hn_sched_tx)
4719 		goto do_sched;
4720 
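	/*
	 * Try to transmit directly while the TX lock is uncontended;
	 * otherwise fall back to the per-ring TX task.
	 */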
4721 	if (mtx_trylock(&txr->hn_tx_lock)) {
4722 		int sched;
4723 
4724 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4725 		mtx_unlock(&txr->hn_tx_lock);
4726 		if (!sched)
4727 			return 0;
4728 	}
4729 do_sched:
4730 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4731 	return 0;
4732 }
4733 
4734 static void
4735 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4736 {
4737 	struct mbuf *m;
4738 
4739 	mtx_lock(&txr->hn_tx_lock);
4740 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4741 		m_freem(m);
4742 	mtx_unlock(&txr->hn_tx_lock);
4743 }
4744 
4745 static void
4746 hn_xmit_qflush(struct ifnet *ifp)
4747 {
4748 	struct hn_softc *sc = ifp->if_softc;
4749 	int i;
4750 
4751 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4752 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4753 	if_qflush(ifp);
4754 }
4755 
4756 static void
4757 hn_xmit_txeof(struct hn_tx_ring *txr)
4758 {
4759 
4760 	if (txr->hn_sched_tx)
4761 		goto do_sched;
4762 
4763 	if (mtx_trylock(&txr->hn_tx_lock)) {
4764 		int sched;
4765 
4766 		txr->hn_oactive = 0;
4767 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4768 		mtx_unlock(&txr->hn_tx_lock);
4769 		if (sched) {
4770 			taskqueue_enqueue(txr->hn_tx_taskq,
4771 			    &txr->hn_tx_task);
4772 		}
4773 	} else {
4774 do_sched:
4775 		/*
4776 		 * Release oactive earlier, in the hope that others
4777 		 * can catch up.  The task will clear oactive again
4778 		 * while holding the hn_tx_lock to avoid possible
4779 		 * races.
4780 		 */
4781 		txr->hn_oactive = 0;
4782 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4783 	}
4784 }
4785 
4786 static void
4787 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4788 {
4789 	struct hn_tx_ring *txr = xtxr;
4790 
4791 	mtx_lock(&txr->hn_tx_lock);
4792 	hn_xmit(txr, 0);
4793 	mtx_unlock(&txr->hn_tx_lock);
4794 }
4795 
4796 static void
4797 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4798 {
4799 	struct hn_tx_ring *txr = xtxr;
4800 
4801 	mtx_lock(&txr->hn_tx_lock);
4802 	txr->hn_oactive = 0;
4803 	hn_xmit(txr, 0);
4804 	mtx_unlock(&txr->hn_tx_lock);
4805 }
4806 
4807 static int
4808 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4809 {
4810 	struct vmbus_chan_br cbr;
4811 	struct hn_rx_ring *rxr;
4812 	struct hn_tx_ring *txr = NULL;
4813 	int idx, error;
4814 
4815 	idx = vmbus_chan_subidx(chan);
4816 
4817 	/*
4818 	 * Link this channel to RX/TX ring.
4819 	 */
4820 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4821 	    ("invalid channel index %d, should be >= 0 && < %d",
4822 	     idx, sc->hn_rx_ring_inuse));
4823 	rxr = &sc->hn_rx_ring[idx];
4824 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4825 	    ("RX ring %d already attached", idx));
4826 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4827 	rxr->hn_chan = chan;
4828 
4829 	if (bootverbose) {
4830 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4831 		    idx, vmbus_chan_id(chan));
4832 	}
4833 
4834 	if (idx < sc->hn_tx_ring_inuse) {
4835 		txr = &sc->hn_tx_ring[idx];
4836 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4837 		    ("TX ring %d already attached", idx));
4838 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4839 
4840 		txr->hn_chan = chan;
4841 		if (bootverbose) {
4842 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4843 			    idx, vmbus_chan_id(chan));
4844 		}
4845 	}
4846 
4847 	/* Bind this channel to a proper CPU. */
4848 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4849 
4850 	/*
4851 	 * Open this channel on the bufring allocated in hn_create_rx_data().
4852 	 */
4853 	cbr.cbr = rxr->hn_br;
4854 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4855 	cbr.cbr_txsz = HN_TXBR_SIZE;
4856 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4857 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4858 	if (error) {
4859 		if (error == EISCONN) {
4860 			if_printf(sc->hn_ifp, "bufring is connected after "
4861 			    "chan%u open failure\n", vmbus_chan_id(chan));
4862 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4863 		} else {
4864 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4865 			    vmbus_chan_id(chan), error);
4866 		}
4867 	}
4868 	return (error);
4869 }
4870 
4871 static void
4872 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4873 {
4874 	struct hn_rx_ring *rxr;
4875 	int idx, error;
4876 
4877 	idx = vmbus_chan_subidx(chan);
4878 
4879 	/*
4880 	 * Link this channel to RX/TX ring.
4881 	 */
4882 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4883 	    ("invalid channel index %d, should be >= 0 && < %d",
4884 	     idx, sc->hn_rx_ring_inuse));
4885 	rxr = &sc->hn_rx_ring[idx];
4886 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4887 	    ("RX ring %d is not attached", idx));
4888 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4889 
4890 	if (idx < sc->hn_tx_ring_inuse) {
4891 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4892 
4893 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4894 		    ("TX ring %d is not attached", idx));
4895 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4896 	}
4897 
4898 	/*
4899 	 * Close this channel.
4900 	 *
4901 	 * NOTE:
4902 	 * Channel closing does _not_ destroy the target channel.
4903 	 */
4904 	error = vmbus_chan_close_direct(chan);
4905 	if (error == EISCONN) {
4906 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4907 		    "after being closed\n", vmbus_chan_id(chan));
4908 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4909 	} else if (error) {
4910 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4911 		    vmbus_chan_id(chan), error);
4912 	}
4913 }
4914 
4915 static int
4916 hn_attach_subchans(struct hn_softc *sc)
4917 {
4918 	struct vmbus_channel **subchans;
4919 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4920 	int i, error = 0;
4921 
4922 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4923 
4924 	/* Attach the sub-channels. */
4925 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4926 	for (i = 0; i < subchan_cnt; ++i) {
4927 		int error1;
4928 
4929 		error1 = hn_chan_attach(sc, subchans[i]);
4930 		if (error1) {
4931 			error = error1;
4932 			/* Move on; all channels will be detached later. */
4933 		}
4934 	}
4935 	vmbus_subchan_rel(subchans, subchan_cnt);
4936 
4937 	if (error) {
4938 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4939 	} else {
4940 		if (bootverbose) {
4941 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4942 			    subchan_cnt);
4943 		}
4944 	}
4945 	return (error);
4946 }
4947 
4948 static void
4949 hn_detach_allchans(struct hn_softc *sc)
4950 {
4951 	struct vmbus_channel **subchans;
4952 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4953 	int i;
4954 
4955 	if (subchan_cnt == 0)
4956 		goto back;
4957 
4958 	/* Detach the sub-channels. */
4959 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4960 	for (i = 0; i < subchan_cnt; ++i)
4961 		hn_chan_detach(sc, subchans[i]);
4962 	vmbus_subchan_rel(subchans, subchan_cnt);
4963 
4964 back:
4965 	/*
4966 	 * Detach the primary channel, _after_ all sub-channels
4967 	 * are detached.
4968 	 */
4969 	hn_chan_detach(sc, sc->hn_prichan);
4970 
4971 	/* Wait for sub-channels to be destroyed, if any. */
4972 	vmbus_subchan_drain(sc->hn_prichan);
4973 
4974 #ifdef INVARIANTS
4975 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4976 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4977 		    HN_RX_FLAG_ATTACHED) == 0,
4978 		    ("%dth RX ring is still attached", i));
4979 	}
4980 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4981 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4982 		    HN_TX_FLAG_ATTACHED) == 0,
4983 		    ("%dth TX ring is still attached", i));
4984 	}
4985 #endif
4986 }
4987 
4988 static int
4989 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4990 {
4991 	struct vmbus_channel **subchans;
4992 	int nchan, rxr_cnt, error;
4993 
4994 	nchan = *nsubch + 1;
4995 	if (nchan == 1) {
4996 		/*
4997 		 * Multiple RX/TX rings are not requested.
4998 		 */
4999 		*nsubch = 0;
5000 		return (0);
5001 	}
5002 
5003 	/*
5004 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
5005 	 * table entries.
5006 	 */
5007 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5008 	if (error) {
5009 		/* No RSS; this is benign. */
5010 		*nsubch = 0;
5011 		return (0);
5012 	}
5013 	if (bootverbose) {
5014 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5015 		    rxr_cnt, nchan);
5016 	}
5017 
5018 	if (nchan > rxr_cnt)
5019 		nchan = rxr_cnt;
5020 	if (nchan == 1) {
5021 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5022 		*nsubch = 0;
5023 		return (0);
5024 	}
5025 
5026 	/*
5027 	 * Allocate sub-channels from NVS.
5028 	 */
5029 	*nsubch = nchan - 1;
5030 	error = hn_nvs_alloc_subchans(sc, nsubch);
5031 	if (error || *nsubch == 0) {
5032 		/* Failed to allocate sub-channels. */
5033 		*nsubch = 0;
5034 		return (0);
5035 	}
5036 
5037 	/*
5038 	 * Wait for all sub-channels to become ready before moving on.
5039 	 */
5040 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5041 	vmbus_subchan_rel(subchans, *nsubch);
5042 	return (0);
5043 }
5044 
5045 static bool
5046 hn_synth_attachable(const struct hn_softc *sc)
5047 {
5048 	int i;
5049 
5050 	if (sc->hn_flags & HN_FLAG_ERRORS)
5051 		return (false);
5052 
5053 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5054 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5055 
5056 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5057 			return (false);
5058 	}
5059 	return (true);
5060 }
5061 
5062 /*
5063  * Make sure that the RX filter is zero after the successful
5064  * RNDIS initialization.
5065  *
5066  * NOTE:
5067  * Under certain conditions on certain versions of Hyper-V,
5068  * the RNDIS rxfilter is _not_ zero on the hypervisor side
5069  * after the successful RNDIS initialization, which breaks
5070  * the assumption of any following code (well, it breaks the
5071  * RNDIS API contract actually).  Clear the RNDIS rxfilter
5072  * explicitly, drain packets sneaking through, and drain the
5073  * interrupt taskqueues scheduled due to the stealth packets.
5074  */
5075 static void
5076 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5077 {
5078 
5079 	hn_disable_rx(sc);
5080 	hn_drain_rxtx(sc, nchan);
5081 }
5082 
5083 static int
5084 hn_synth_attach(struct hn_softc *sc, int mtu)
5085 {
5086 #define ATTACHED_NVS		0x0002
5087 #define ATTACHED_RNDIS		0x0004
5088 
5089 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5090 	int error, nsubch, nchan = 1, i, rndis_inited;
5091 	uint32_t old_caps, attached = 0;
5092 
5093 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5094 	    ("synthetic parts were attached"));
5095 
5096 	if (!hn_synth_attachable(sc))
5097 		return (ENXIO);
5098 
5099 	/* Save capabilities for later verification. */
5100 	old_caps = sc->hn_caps;
5101 	sc->hn_caps = 0;
5102 
5103 	/* Clear RSS state. */
5104 	sc->hn_rss_ind_size = 0;
5105 	sc->hn_rss_hash = 0;
5106 
5107 	/*
5108 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
5109 	 */
5110 	error = hn_chan_attach(sc, sc->hn_prichan);
5111 	if (error)
5112 		goto failed;
5113 
5114 	/*
5115 	 * Attach NVS.
5116 	 */
5117 	error = hn_nvs_attach(sc, mtu);
5118 	if (error)
5119 		goto failed;
5120 	attached |= ATTACHED_NVS;
5121 
5122 	/*
5123 	 * Attach RNDIS _after_ NVS is attached.
5124 	 */
5125 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
5126 	if (rndis_inited)
5127 		attached |= ATTACHED_RNDIS;
5128 	if (error)
5129 		goto failed;
5130 
5131 	/*
5132 	 * Make sure capabilities are not changed.
5133 	 */
5134 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5135 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5136 		    old_caps, sc->hn_caps);
5137 		error = ENXIO;
5138 		goto failed;
5139 	}
5140 
5141 	/*
5142 	 * Allocate sub-channels for multi-TX/RX rings.
5143 	 *
5144 	 * NOTE:
5145 	 * The # of RX rings that can be used is equivalent to the # of
5146 	 * channels to be requested.
5147 	 */
5148 	nsubch = sc->hn_rx_ring_cnt - 1;
5149 	error = hn_synth_alloc_subchans(sc, &nsubch);
5150 	if (error)
5151 		goto failed;
5152 	/* NOTE: _Full_ synthetic parts detach is required now. */
5153 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5154 
5155 	/*
5156 	 * Set the # of TX/RX rings that could be used according to
5157 	 * the # of channels that NVS offered.
5158 	 */
5159 	nchan = nsubch + 1;
5160 	hn_set_ring_inuse(sc, nchan);
5161 	if (nchan == 1) {
5162 		/* Only the primary channel can be used; done */
5163 		goto back;
5164 	}
5165 
5166 	/*
5167 	 * Attach the sub-channels.
5168 	 *
5169 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
5170 	 */
5171 	error = hn_attach_subchans(sc);
5172 	if (error)
5173 		goto failed;
5174 
5175 	/*
5176 	 * Configure RSS key and indirect table _after_ all sub-channels
5177 	 * are attached.
5178 	 */
5179 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
5180 		/*
5181 		 * RSS key is not set yet; set it to the default RSS key.
5182 		 */
5183 		if (bootverbose)
5184 			if_printf(sc->hn_ifp, "setup default RSS key\n");
5185 #ifdef RSS
5186 		rss_getkey(rss->rss_key);
5187 #else
5188 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
5189 #endif
5190 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
5191 	}
5192 
5193 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
5194 		/*
5195 		 * RSS indirect table is not set yet; set it up in round-
5196 		 * robin fashion.
5197 		 */
5198 		if (bootverbose) {
5199 			if_printf(sc->hn_ifp, "setup default RSS indirect "
5200 			    "table\n");
5201 		}
5202 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
5203 			uint32_t subidx;
5204 
5205 #ifdef RSS
5206 			subidx = rss_get_indirection_to_bucket(i);
5207 #else
5208 			subidx = i;
5209 #endif
5210 			rss->rss_ind[i] = subidx % nchan;
5211 		}
5212 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
5213 	} else {
5214 		/*
5215 		 * # of usable channels may be changed, so we have to
5216 		 * make sure that all entries in RSS indirect table
5217 		 * are valid.
5218 		 *
5219 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
5220 		 */
5221 		hn_rss_ind_fixup(sc);
5222 	}
5223 
5224 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
5225 	if (error)
5226 		goto failed;
5227 back:
5228 	/*
5229 	 * Fixup transmission aggregation setup.
5230 	 */
5231 	hn_set_txagg(sc);
5232 	hn_rndis_init_fixat(sc, nchan);
5233 	return (0);
5234 
5235 failed:
5236 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
5237 		hn_rndis_init_fixat(sc, nchan);
5238 		hn_synth_detach(sc);
5239 	} else {
5240 		if (attached & ATTACHED_RNDIS) {
5241 			hn_rndis_init_fixat(sc, nchan);
5242 			hn_rndis_detach(sc);
5243 		}
5244 		if (attached & ATTACHED_NVS)
5245 			hn_nvs_detach(sc);
5246 		hn_chan_detach(sc, sc->hn_prichan);
5247 		/* Restore old capabilities. */
5248 		sc->hn_caps = old_caps;
5249 	}
5250 	return (error);
5251 
5252 #undef ATTACHED_RNDIS
5253 #undef ATTACHED_NVS
5254 }
5255 
5256 /*
5257  * NOTE:
5258  * The interface must have been suspended through hn_suspend(), before
5259  * this function gets called.
5260  */
5261 static void
5262 hn_synth_detach(struct hn_softc *sc)
5263 {
5264 
5265 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5266 	    ("synthetic parts were not attached"));
5267 
5268 	/* Detach the RNDIS first. */
5269 	hn_rndis_detach(sc);
5270 
5271 	/* Detach NVS. */
5272 	hn_nvs_detach(sc);
5273 
5274 	/* Detach all of the channels. */
5275 	hn_detach_allchans(sc);
5276 
5277 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
5278 }
5279 
5280 static void
5281 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5282 {
5283 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
5284 	    ("invalid ring count %d", ring_cnt));
5285 
5286 	if (sc->hn_tx_ring_cnt > ring_cnt)
5287 		sc->hn_tx_ring_inuse = ring_cnt;
5288 	else
5289 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5290 	sc->hn_rx_ring_inuse = ring_cnt;
5291 
5292 #ifdef RSS
5293 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
5294 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
5295 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
5296 		    rss_getnumbuckets());
5297 	}
5298 #endif
5299 
5300 	if (bootverbose) {
5301 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
5302 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
5303 	}
5304 }
5305 
5306 static void
5307 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
5308 {
5309 
5310 	/*
5311 	 * NOTE:
5312 	 * The TX bufring will not be drained by the hypervisor,
5313 	 * if the primary channel is revoked.
5314 	 */
5315 	while (!vmbus_chan_rx_empty(chan) ||
5316 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
5317 	     !vmbus_chan_tx_empty(chan)))
5318 		pause("waitch", 1);
5319 	vmbus_chan_intr_drain(chan);
5320 }
5321 
5322 static void
5323 hn_disable_rx(struct hn_softc *sc)
5324 {
5325 
5326 	/*
5327 	 * Disable RX by clearing RX filter forcefully.
5328 	 */
5329 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
5330 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
5331 
5332 	/*
5333 	 * Give RNDIS enough time (200ms) to flush all pending data packets.
5334 	 */
5335 	pause("waitrx", (200 * hz) / 1000);
5336 }
5337 
5338 /*
5339  * NOTE:
5340  * RX/TX _must_ have been suspended/disabled, before this function
5341  * is called.
5342  */
5343 static void
5344 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5345 {
5346 	struct vmbus_channel **subch = NULL;
5347 	int nsubch;
5348 
5349 	/*
5350 	 * Drain RX/TX bufrings and interrupts.
5351 	 */
5352 	nsubch = nchan - 1;
5353 	if (nsubch > 0)
5354 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5355 
5356 	if (subch != NULL) {
5357 		int i;
5358 
5359 		for (i = 0; i < nsubch; ++i)
5360 			hn_chan_drain(sc, subch[i]);
5361 	}
5362 	hn_chan_drain(sc, sc->hn_prichan);
5363 
5364 	if (subch != NULL)
5365 		vmbus_subchan_rel(subch, nsubch);
5366 }
5367 
5368 static void
5369 hn_suspend_data(struct hn_softc *sc)
5370 {
5371 	struct hn_tx_ring *txr;
5372 	int i;
5373 
5374 	HN_LOCK_ASSERT(sc);
5375 
5376 	/*
5377 	 * Suspend TX.
5378 	 */
5379 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5380 		txr = &sc->hn_tx_ring[i];
5381 
5382 		mtx_lock(&txr->hn_tx_lock);
5383 		txr->hn_suspended = 1;
5384 		mtx_unlock(&txr->hn_tx_lock);
5385 		/* No one is able to send more packets now. */
5386 
5387 		/*
5388 		 * Wait for all pending sends to finish.
5389 		 *
5390 		 * NOTE:
5391 		 * We will _not_ receive all pending send-done, if the
5392 		 * primary channel is revoked.
5393 		 */
5394 		while (hn_tx_ring_pending(txr) &&
5395 		    !vmbus_chan_is_revoked(sc->hn_prichan))
5396 			pause("hnwtx", 1 /* 1 tick */);
5397 	}
5398 
5399 	/*
5400 	 * Disable RX.
5401 	 */
5402 	hn_disable_rx(sc);
5403 
5404 	/*
5405 	 * Drain RX/TX.
5406 	 */
5407 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5408 
5409 	/*
5410 	 * Drain any pending TX tasks.
5411 	 *
5412 	 * NOTE:
5413 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
5414 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
5415 	 */
5416 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5417 		txr = &sc->hn_tx_ring[i];
5418 
5419 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5420 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5421 	}
5422 }
5423 
5424 static void
5425 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5426 {
5427 
5428 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5429 }
5430 
5431 static void
5432 hn_suspend_mgmt(struct hn_softc *sc)
5433 {
5434 	struct task task;
5435 
5436 	HN_LOCK_ASSERT(sc);
5437 
5438 	/*
5439 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5440 	 * through hn_mgmt_taskq.
5441 	 */
5442 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5443 	vmbus_chan_run_task(sc->hn_prichan, &task);
5444 
5445 	/*
5446 	 * Make sure that all pending management tasks are completed.
5447 	 */
5448 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5449 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5450 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5451 }
5452 
5453 static void
5454 hn_suspend(struct hn_softc *sc)
5455 {
5456 
5457 	/* Disable polling. */
5458 	hn_polling(sc, 0);
5459 
5460 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5461 	    (sc->hn_flags & HN_FLAG_RXVF))
5462 		hn_suspend_data(sc);
5463 	hn_suspend_mgmt(sc);
5464 }
5465 
5466 static void
5467 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5468 {
5469 	int i;
5470 
5471 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5472 	    ("invalid TX ring count %d", tx_ring_cnt));
5473 
5474 	for (i = 0; i < tx_ring_cnt; ++i) {
5475 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5476 
5477 		mtx_lock(&txr->hn_tx_lock);
5478 		txr->hn_suspended = 0;
5479 		mtx_unlock(&txr->hn_tx_lock);
5480 	}
5481 }
5482 
5483 static void
5484 hn_resume_data(struct hn_softc *sc)
5485 {
5486 	int i;
5487 
5488 	HN_LOCK_ASSERT(sc);
5489 
5490 	/*
5491 	 * Re-enable RX.
5492 	 */
5493 	hn_rxfilter_config(sc);
5494 
5495 	/*
5496 	 * Make sure to clear suspend status on "all" TX rings,
5497 	 * since hn_tx_ring_inuse can be changed after
5498 	 * hn_suspend_data().
5499 	 */
5500 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5501 
5502 #ifdef HN_IFSTART_SUPPORT
5503 	if (!hn_use_if_start)
5504 #endif
5505 	{
5506 		/*
5507 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5508 		 * reduced.
5509 		 */
5510 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5511 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5512 	}
5513 
5514 	/*
5515 	 * Kick start TX.
5516 	 */
5517 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5518 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5519 
5520 		/*
5521 		 * Use txeof task, so that any pending oactive can be
5522 		 * cleared properly.
5523 		 */
5524 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5525 	}
5526 }
5527 
5528 static void
5529 hn_resume_mgmt(struct hn_softc *sc)
5530 {
5531 
5532 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5533 
5534 	/*
5535 	 * Kick off network change detection, if it was pending.
5536 	 * If no network change was pending, start link status
5537 	 * checks, which is more lightweight than network change
5538 	 * detection.
5539 	 */
5540 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5541 		hn_change_network(sc);
5542 	else
5543 		hn_update_link_status(sc);
5544 }
5545 
5546 static void
5547 hn_resume(struct hn_softc *sc)
5548 {
5549 
5550 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5551 	    (sc->hn_flags & HN_FLAG_RXVF))
5552 		hn_resume_data(sc);
5553 
5554 	/*
5555 	 * When the VF is activated, the synthetic interface is changed
5556 	 * to DOWN in hn_rxvf_change().  Here, if the VF is still active,
5557 	 * we don't call hn_resume_mgmt() until the VF is deactivated in
5558 	 * hn_rxvf_change().
5559 	 */
5560 	if (!(sc->hn_flags & HN_FLAG_RXVF))
5561 		hn_resume_mgmt(sc);
5562 
5563 	/*
5564 	 * Re-enable polling if this interface is running and
5565 	 * the polling is requested.
5566 	 */
5567 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5568 		hn_polling(sc, sc->hn_pollhz);
5569 }
5570 
5571 static void
5572 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5573 {
5574 	const struct rndis_status_msg *msg;
5575 	int ofs;
5576 
5577 	if (dlen < sizeof(*msg)) {
5578 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5579 		return;
5580 	}
5581 	msg = data;
5582 
5583 	switch (msg->rm_status) {
5584 	case RNDIS_STATUS_MEDIA_CONNECT:
5585 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5586 		hn_update_link_status(sc);
5587 		break;
5588 
5589 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5590 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
5591 		/* Not really useful; ignore. */
5592 		break;
5593 
5594 	case RNDIS_STATUS_NETWORK_CHANGE:
5595 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5596 		if (dlen < ofs + msg->rm_stbuflen ||
5597 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5598 			if_printf(sc->hn_ifp, "network changed\n");
5599 		} else {
5600 			uint32_t change;
5601 
5602 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5603 			    sizeof(change));
5604 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5605 			    change);
5606 		}
5607 		hn_change_network(sc);
5608 		break;
5609 
5610 	default:
5611 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5612 		    msg->rm_status);
5613 		break;
5614 	}
5615 }
5616 
5617 static int
5618 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5619 {
5620 	const struct rndis_pktinfo *pi = info_data;
5621 	uint32_t mask = 0;
5622 
5623 	while (info_dlen != 0) {
5624 		const void *data;
5625 		uint32_t dlen;
5626 
5627 		if (__predict_false(info_dlen < sizeof(*pi)))
5628 			return (EINVAL);
5629 		if (__predict_false(info_dlen < pi->rm_size))
5630 			return (EINVAL);
5631 		info_dlen -= pi->rm_size;
5632 
5633 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5634 			return (EINVAL);
5635 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5636 			return (EINVAL);
5637 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5638 		data = pi->rm_data;
5639 
5640 		switch (pi->rm_type) {
5641 		case NDIS_PKTINFO_TYPE_VLAN:
5642 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5643 				return (EINVAL);
5644 			info->vlan_info = *((const uint32_t *)data);
5645 			mask |= HN_RXINFO_VLAN;
5646 			break;
5647 
5648 		case NDIS_PKTINFO_TYPE_CSUM:
5649 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5650 				return (EINVAL);
5651 			info->csum_info = *((const uint32_t *)data);
5652 			mask |= HN_RXINFO_CSUM;
5653 			break;
5654 
5655 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5656 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5657 				return (EINVAL);
5658 			info->hash_value = *((const uint32_t *)data);
5659 			mask |= HN_RXINFO_HASHVAL;
5660 			break;
5661 
5662 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5663 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5664 				return (EINVAL);
5665 			info->hash_info = *((const uint32_t *)data);
5666 			mask |= HN_RXINFO_HASHINF;
5667 			break;
5668 
5669 		default:
5670 			goto next;
5671 		}
5672 
5673 		if (mask == HN_RXINFO_ALL) {
5674 			/* All found; done */
5675 			break;
5676 		}
5677 next:
5678 		pi = (const struct rndis_pktinfo *)
5679 		    ((const uint8_t *)pi + pi->rm_size);
5680 	}
5681 
5682 	/*
5683 	 * Final fixup.
5684 	 * - If there is no hash value, invalidate the hash info.
5685 	 */
5686 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5687 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5688 	return (0);
5689 }
5690 
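/*
 * Return true if the byte ranges [off, off + len) and
 * [check_off, check_off + check_len) overlap.
 */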
5691 static __inline bool
5692 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5693 {
5694 
5695 	if (off < check_off) {
5696 		if (__predict_true(off + len <= check_off))
5697 			return (false);
5698 	} else if (off > check_off) {
5699 		if (__predict_true(check_off + check_len <= off))
5700 			return (false);
5701 	}
5702 	return (true);
5703 }
5704 
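/*
 * Validate an RNDIS data message: check the message length, the data,
 * OOB and per-packet-info offsets/lengths, and make sure none of these
 * regions overlap.  Then extract the useful per-packet-info and hand
 * the data region to hn_rxpkt().
 */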
5705 static void
5706 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5707 {
5708 	const struct rndis_packet_msg *pkt;
5709 	struct hn_rxinfo info;
5710 	int data_off, pktinfo_off, data_len, pktinfo_len;
5711 
5712 	/*
5713 	 * Check length.
5714 	 */
5715 	if (__predict_false(dlen < sizeof(*pkt))) {
5716 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5717 		return;
5718 	}
5719 	pkt = data;
5720 
5721 	if (__predict_false(dlen < pkt->rm_len)) {
5722 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5723 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5724 		return;
5725 	}
5726 	if (__predict_false(pkt->rm_len <
5727 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5728 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5729 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5730 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5731 		    pkt->rm_pktinfolen);
5732 		return;
5733 	}
5734 	if (__predict_false(pkt->rm_datalen == 0)) {
5735 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5736 		return;
5737 	}
5738 
5739 	/*
5740 	 * Check offsets.
5741 	 */
5742 #define IS_OFFSET_INVALID(ofs)			\
5743 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5744 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5745 
5746 	/* XXX Hyper-V does not meet the data offset alignment requirement. */
5747 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5748 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5749 		    "data offset %u\n", pkt->rm_dataoffset);
5750 		return;
5751 	}
5752 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5753 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5754 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5755 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5756 		return;
5757 	}
5758 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5759 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5760 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5761 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5762 		return;
5763 	}
5764 
5765 #undef IS_OFFSET_INVALID
5766 
5767 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5768 	data_len = pkt->rm_datalen;
5769 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5770 	pktinfo_len = pkt->rm_pktinfolen;
5771 
5772 	/*
5773 	 * Check OOB coverage.
5774 	 */
5775 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5776 		int oob_off, oob_len;
5777 
5778 		if_printf(rxr->hn_ifp, "got oobdata\n");
5779 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5780 		oob_len = pkt->rm_oobdatalen;
5781 
5782 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5783 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5784 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5785 			    pkt->rm_len, oob_off, oob_len);
5786 			return;
5787 		}
5788 
5789 		/*
5790 		 * Check against data.
5791 		 */
5792 		if (hn_rndis_check_overlap(oob_off, oob_len,
5793 		    data_off, data_len)) {
5794 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5795 			    "oob overlaps data, oob abs %d len %d, "
5796 			    "data abs %d len %d\n",
5797 			    oob_off, oob_len, data_off, data_len);
5798 			return;
5799 		}
5800 
5801 		/*
5802 		 * Check against pktinfo.
5803 		 */
5804 		if (pktinfo_len != 0 &&
5805 		    hn_rndis_check_overlap(oob_off, oob_len,
5806 		    pktinfo_off, pktinfo_len)) {
5807 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5808 			    "oob overlaps pktinfo, oob abs %d len %d, "
5809 			    "pktinfo abs %d len %d\n",
5810 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5811 			return;
5812 		}
5813 	}
5814 
5815 	/*
5816 	 * Check per-packet-info coverage and find useful per-packet-info.
5817 	 */
5818 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5819 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5820 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5821 	if (__predict_true(pktinfo_len != 0)) {
5822 		bool overlap;
5823 		int error;
5824 
5825 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5826 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5827 			    "pktinfo overflow, msglen %u, "
5828 			    "pktinfo abs %d len %d\n",
5829 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5830 			return;
5831 		}
5832 
5833 		/*
5834 		 * Check packet info coverage.
5835 		 */
5836 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5837 		    data_off, data_len);
5838 		if (__predict_false(overlap)) {
5839 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5840 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5841 			    "data abs %d len %d\n",
5842 			    pktinfo_off, pktinfo_len, data_off, data_len);
5843 			return;
5844 		}
5845 
5846 		/*
5847 		 * Find useful per-packet-info.
5848 		 */
5849 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5850 		    pktinfo_len, &info);
5851 		if (__predict_false(error)) {
5852 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5853 			    "pktinfo\n");
5854 			return;
5855 		}
5856 	}
5857 
5858 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5859 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5860 		    "data overflow, msglen %u, data abs %d len %d\n",
5861 		    pkt->rm_len, data_off, data_len);
5862 		return;
5863 	}
5864 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5865 }
5866 
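/*
 * Dispatch an inbound RNDIS message: data messages go through the hot
 * RX data path, status indications to hn_rndis_rx_status(), and
 * everything else to the RNDIS control path.
 */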
5867 static __inline void
5868 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5869 {
5870 	const struct rndis_msghdr *hdr;
5871 
5872 	if (__predict_false(dlen < sizeof(*hdr))) {
5873 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5874 		return;
5875 	}
5876 	hdr = data;
5877 
5878 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5879 		/* Hot data path. */
5880 		hn_rndis_rx_data(rxr, data, dlen);
5881 		/* Done! */
5882 		return;
5883 	}
5884 
5885 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5886 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5887 	else
5888 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5889 }
5890 
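/*
 * Handle an inband NVS notification.  TX table notes are ignored;
 * anything else is merely logged.
 */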
5891 static void
5892 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5893 {
5894 	const struct hn_nvs_hdr *hdr;
5895 
5896 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5897 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5898 		return;
5899 	}
5900 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5901 
5902 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5903 		/* Useless; ignore */
5904 		return;
5905 	}
5906 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5907 }
5908 
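/*
 * Handle an NVS completion packet.  The transaction id carries the
 * pointer to the send context, whose callback consumes the completion
 * data.
 */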
5909 static void
5910 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5911     const struct vmbus_chanpkt_hdr *pkt)
5912 {
5913 	struct hn_nvs_sendctx *sndc;
5914 
5915 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5916 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5917 	    VMBUS_CHANPKT_DATALEN(pkt));
5918 	/*
5919 	 * NOTE:
5920 	 * 'sndc' must not be accessed after this point, since it may
5921 	 * have been freed by its callback.
5922 	 */
5923 }
5924 
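/*
 * Handle an RXBUF channel packet.  Each receive range points at one
 * RNDIS message inside the shared RX buffer; process them all, then
 * ack the packet so the host can reuse the buffer space.
 */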
5925 static void
5926 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5927     const struct vmbus_chanpkt_hdr *pkthdr)
5928 {
5929 	const struct vmbus_chanpkt_rxbuf *pkt;
5930 	const struct hn_nvs_hdr *nvs_hdr;
5931 	int count, i, hlen;
5932 
5933 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5934 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5935 		return;
5936 	}
5937 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5938 
5939 	/* Make sure that this is an RNDIS message. */
5940 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5941 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5942 		    nvs_hdr->nvs_type);
5943 		return;
5944 	}
5945 
5946 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5947 	if (__predict_false(hlen < sizeof(*pkt))) {
5948 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5949 		return;
5950 	}
5951 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5952 
5953 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5954 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5955 		    pkt->cp_rxbuf_id);
5956 		return;
5957 	}
5958 
5959 	count = pkt->cp_rxbuf_cnt;
5960 	if (__predict_false(hlen <
5961 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5962 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5963 		return;
5964 	}
5965 
5966 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5967 	for (i = 0; i < count; ++i) {
5968 		int ofs, len;
5969 
5970 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5971 		len = pkt->cp_rxbuf[i].rb_len;
5972 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5973 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5974 			    "ofs %d, len %d\n", i, ofs, len);
5975 			continue;
5976 		}
5977 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5978 	}
5979 
5980 	/*
5981 	 * Ack the consumed RXBUF associated w/ this channel packet,
5982 	 * so that this RXBUF can be recycled by the hypervisor.
5983 	 */
5984 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5985 }
5986 
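/*
 * Ack a consumed RXBUF by sending a completion back on the channel,
 * retrying up to 10 times with a short delay if vmbus_chan_send()
 * returns EAGAIN.
 */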
5987 static void
5988 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5989     uint64_t tid)
5990 {
5991 	struct hn_nvs_rndis_ack ack;
5992 	int retries, error;
5993 
5994 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5995 	ack.nvs_status = HN_NVS_STATUS_OK;
5996 
5997 	retries = 0;
5998 again:
5999 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6000 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6001 	if (__predict_false(error == EAGAIN)) {
6002 		/*
6003 		 * NOTE:
6004 		 * This should _not_ happen in the real world, since the
6005 		 * consumption of the TX bufring from the TX path is
6006 		 * controlled.
6007 		 */
6008 		if (rxr->hn_ack_failed == 0)
6009 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6010 		rxr->hn_ack_failed++;
6011 		retries++;
6012 		if (retries < 10) {
6013 			DELAY(100);
6014 			goto again;
6015 		}
6016 		/* RXBUF leaks! */
6017 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
6018 	}
6019 }
6020 
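/*
 * Per-channel receive callback: drain all pending channel packets,
 * growing the packet buffer as needed, and dispatch each packet by
 * its type.
 */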
6021 static void
6022 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6023 {
6024 	struct hn_rx_ring *rxr = xrxr;
6025 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
6026 
6027 	for (;;) {
6028 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6029 		int error, pktlen;
6030 
6031 		pktlen = rxr->hn_pktbuf_len;
6032 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6033 		if (__predict_false(error == ENOBUFS)) {
6034 			void *nbuf;
6035 			int nlen;
6036 
6037 			/*
6038 			 * Expand channel packet buffer.
6039 			 *
6040 			 * XXX
6041 			 * Use M_WAITOK here, since allocation failure
6042 			 * is fatal.
6043 			 */
6044 			nlen = rxr->hn_pktbuf_len * 2;
6045 			while (nlen < pktlen)
6046 				nlen *= 2;
6047 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6048 
6049 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6050 			    rxr->hn_pktbuf_len, nlen);
6051 
6052 			free(rxr->hn_pktbuf, M_DEVBUF);
6053 			rxr->hn_pktbuf = nbuf;
6054 			rxr->hn_pktbuf_len = nlen;
6055 			/* Retry! */
6056 			continue;
6057 		} else if (__predict_false(error == EAGAIN)) {
6058 			/* No more channel packets; done! */
6059 			break;
6060 		}
6061 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6062 
6063 		switch (pkt->cph_type) {
6064 		case VMBUS_CHANPKT_TYPE_COMP:
6065 			hn_nvs_handle_comp(sc, chan, pkt);
6066 			break;
6067 
6068 		case VMBUS_CHANPKT_TYPE_RXBUF:
6069 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
6070 			break;
6071 
6072 		case VMBUS_CHANPKT_TYPE_INBAND:
6073 			hn_nvs_handle_notify(sc, pkt);
6074 			break;
6075 
6076 		default:
6077 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6078 			    pkt->cph_type);
6079 			break;
6080 		}
6081 	}
6082 	hn_chan_rollup(rxr, rxr->hn_txr);
6083 }
6084 
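/*
 * Driver-wide initialization: set up the VF map, sanitize the TX
 * taskqueue count and mode, and create the global TX taskqueues when
 * running on Hyper-V in global taskqueue mode.
 */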
6085 static void
6086 hn_sysinit(void *arg __unused)
6087 {
6088 	int i;
6089 
6090 	/*
6091 	 * Initialize VF map.
6092 	 */
6093 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6094 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6095 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6096 	    M_WAITOK | M_ZERO);
6097 
6098 	/*
6099 	 * Fix the # of TX taskqueues.
6100 	 */
6101 	if (hn_tx_taskq_cnt <= 0)
6102 		hn_tx_taskq_cnt = 1;
6103 	else if (hn_tx_taskq_cnt > mp_ncpus)
6104 		hn_tx_taskq_cnt = mp_ncpus;
6105 
6106 	/*
6107 	 * Fix the TX taskqueue mode.
6108 	 */
6109 	switch (hn_tx_taskq_mode) {
6110 	case HN_TX_TASKQ_M_INDEP:
6111 	case HN_TX_TASKQ_M_GLOBAL:
6112 	case HN_TX_TASKQ_M_EVTTQ:
6113 		break;
6114 	default:
6115 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6116 		break;
6117 	}
6118 
6119 	if (vm_guest != VM_GUEST_HV)
6120 		return;
6121 
6122 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6123 		return;
6124 
6125 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6126 	    M_DEVBUF, M_WAITOK);
6127 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6128 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6129 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6130 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6131 		    "hn tx%d", i);
6132 	}
6133 }
6134 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6135 
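/*
 * Driver-wide teardown: free the global TX taskqueues and the VF map.
 */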
6136 static void
6137 hn_sysuninit(void *arg __unused)
6138 {
6139 
6140 	if (hn_tx_taskque != NULL) {
6141 		int i;
6142 
6143 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
6144 			taskqueue_free(hn_tx_taskque[i]);
6145 		free(hn_tx_taskque, M_DEVBUF);
6146 	}
6147 
6148 	if (hn_vfmap != NULL)
6149 		free(hn_vfmap, M_DEVBUF);
6150 	rm_destroy(&hn_vfmap_lock);
6151 }
6152 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
6153