xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision e17f5b1d)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
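/*
 * NOTE: HN_LOCK() polls with sx_try_xlock() plus a 1ms DELAY() instead of
 * blocking in sx_xlock(); callers busy-wait until the lock becomes
 * available.
 */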
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
389 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
390 static int			hn_rxfilter_config(struct hn_softc *);
391 static int			hn_rss_reconfig(struct hn_softc *);
392 static void			hn_rss_ind_fixup(struct hn_softc *);
393 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
394 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
395 				    int, const struct hn_rxinfo *);
396 static uint32_t			hn_rss_type_fromndis(uint32_t);
397 static uint32_t			hn_rss_type_tondis(uint32_t);
398 
399 static int			hn_tx_ring_create(struct hn_softc *, int);
400 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
401 static int			hn_create_tx_data(struct hn_softc *, int);
402 static void			hn_fixup_tx_data(struct hn_softc *);
403 static void			hn_fixup_rx_data(struct hn_softc *);
404 static void			hn_destroy_tx_data(struct hn_softc *);
405 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void			hn_txdesc_gc(struct hn_tx_ring *,
407 				    struct hn_txdesc *);
408 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *, struct mbuf **);
410 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 				    struct hn_txdesc *);
412 static void			hn_set_chim_size(struct hn_softc *, int);
413 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
415 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void			hn_resume_tx(struct hn_softc *, int);
417 static void			hn_set_txagg(struct hn_softc *);
418 static void			*hn_try_txagg(struct ifnet *,
419 				    struct hn_tx_ring *, struct hn_txdesc *,
420 				    int);
421 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
423 				    struct hn_softc *, struct vmbus_channel *,
424 				    const void *, int);
425 static int			hn_txpkt_sglist(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_txpkt_chim(struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static int			hn_xmit(struct hn_tx_ring *, int);
430 static void			hn_xmit_taskfunc(void *, int);
431 static void			hn_xmit_txeof(struct hn_tx_ring *);
432 static void			hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int			hn_start_locked(struct hn_tx_ring *, int);
435 static void			hn_start_taskfunc(void *, int);
436 static void			hn_start_txeof(struct hn_tx_ring *);
437 static void			hn_start_txeof_taskfunc(void *, int);
438 #endif
439 
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441     "Hyper-V network interface");
442 
443 /* Trust TCP segment verification on the host side. */
444 static int			hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446     &hn_trust_hosttcp, 0,
447     "Trust TCP segment verification on host side, "
448     "when csum info is missing (global setting)");
449 
450 /* Trust UDP datagram verification on the host side. */
451 static int			hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453     &hn_trust_hostudp, 0,
454     "Trust UDP datagram verification on host side, "
455     "when csum info is missing (global setting)");
456 
457 /* Trust IP packet verification on the host side. */
458 static int			hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460     &hn_trust_hostip, 0,
461     "Trust IP packet verification on host side, "
462     "when csum info is missing (global setting)");
463 
464 /*
465  * Offload UDP/IPv4 checksum.
466  */
467 static int			hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 
471 /*
472  * Offload UDP/IPv6 checksum.
473  */
474 static int			hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 
478 /* Stats. */
479 static counter_u64_t		hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481     &hn_udpcs_fixup, "# of UDP checksum fixup");
482 
483 /*
484  * See hn_set_hlen().
485  *
486  * This value is for Azure.  For Hyper-V, set this above
487  * 65536 to disable UDP datagram checksum fixup.
488  */
489 static int			hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
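/*
 * Declared CTLFLAG_RWTUN, so hw.hn.udpcs_fixup_mtu can be set as a loader
 * tunable or adjusted at runtime with sysctl(8), e.g. raised above 65536
 * to disable the fixup as described above.
 */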
492 
493 /* Limit TSO burst size */
494 static int			hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496     &hn_tso_maxlen, 0, "TSO burst limit");
497 
498 /* Limit chimney send size */
499 static int			hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502 
503 /* Limit the size of packet for direct transmission */
504 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507 
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513     &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516 
517 static int			hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520 
521 #define HN_TX_TASKQ_M_INDEP	0
522 #define HN_TX_TASKQ_M_GLOBAL	1
523 #define HN_TX_TASKQ_M_EVTTQ	2
524 
525 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529 
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int			hn_use_txdesc_bufring = 0;
532 #else
533 static int			hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537 
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int			hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542     &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544 
545 /* # of channels to use */
546 static int			hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548     &hn_chan_cnt, 0,
549     "# of channels to use; each channel has one RX ring and one TX ring");
550 
551 /* # of transmit rings to use */
552 static int			hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554     &hn_tx_ring_cnt, 0, "# of TX rings to use");
555 
556 /* Software TX ring depth */
557 static int			hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560 
561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int			hn_lro_mbufq_depth = 0;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567 
568 /* Packet transmission aggregation size limit */
569 static int			hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572 
573 /* Packet transmission aggregation count limit */
574 static int			hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 
578 /* VF list */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
580     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
581     hn_vflist_sysctl, "A",
582     "VF list");
583 
584 /* VF mapping */
585 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
586     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
587     hn_vfmap_sysctl, "A",
588     "VF mapping");
589 
590 /* Transparent VF */
591 static int			hn_xpnt_vf = 1;
592 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
593     &hn_xpnt_vf, 0, "Transparent VF mode");
594 
595 /* Accurate BPF support for Transparent VF */
596 static int			hn_xpnt_vf_accbpf = 0;
597 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
598     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
599 
600 /* Extra wait for the transparent VF attach routine; unit: seconds. */
601 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
602 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
603     &hn_xpnt_vf_attwait, 0,
604     "Extra wait for transparent VF attach routine; unit: seconds");
605 
606 static u_int			hn_cpu_index;	/* next CPU for channel */
607 static struct taskqueue		**hn_tx_taskque; /* shared TX taskqueues */
608 
609 static struct rmlock		hn_vfmap_lock;
610 static int			hn_vfmap_size;
611 static struct ifnet		**hn_vfmap;
612 
613 #ifndef RSS
614 static const uint8_t
615 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
616 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
617 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
618 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
619 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
620 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
621 };
622 #endif	/* !RSS */
623 
624 static const struct hyperv_guid	hn_guid = {
625 	.hv_guid = {
626 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
627 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
628 };
629 
630 static device_method_t hn_methods[] = {
631 	/* Device interface */
632 	DEVMETHOD(device_probe,		hn_probe),
633 	DEVMETHOD(device_attach,	hn_attach),
634 	DEVMETHOD(device_detach,	hn_detach),
635 	DEVMETHOD(device_shutdown,	hn_shutdown),
636 	DEVMETHOD_END
637 };
638 
639 static driver_t hn_driver = {
640 	"hn",
641 	hn_methods,
642 	sizeof(struct hn_softc)
643 };
644 
645 static devclass_t hn_devclass;
646 
647 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
648 MODULE_VERSION(hn, 1);
649 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
650 
651 #if __FreeBSD_version >= 1100099
652 static void
653 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
654 {
655 	int i;
656 
657 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
658 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
659 }
660 #endif
661 
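/*
 * Send an RNDIS data packet using the guest physical address (GPA)
 * scatter/gather list already prepared in the TX ring (txr->hn_gpa).
 */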
662 static int
663 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
664 {
665 
666 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
667 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
668 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
669 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
670 }
671 
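/*
 * Send an RNDIS data packet that was copied into a chimney sending
 * buffer; only the chimney index and size travel over the channel.
 */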
672 static int
673 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
674 {
675 	struct hn_nvs_rndis rndis;
676 
677 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
678 	    txd->chim_size > 0, ("invalid rndis chim txd"));
679 
680 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
681 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
682 	rndis.nvs_chim_idx = txd->chim_index;
683 	rndis.nvs_chim_sz = txd->chim_size;
684 
685 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
686 	    &rndis, sizeof(rndis), &txd->send_ctx));
687 }
688 
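/*
 * Allocate a chimney sending buffer slot: scan the bitmap for a clear bit
 * and claim it with an atomic test-and-set.  Returns
 * HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 */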
689 static __inline uint32_t
690 hn_chim_alloc(struct hn_softc *sc)
691 {
692 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
693 	u_long *bmap = sc->hn_chim_bmap;
694 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
695 
696 	for (i = 0; i < bmap_cnt; ++i) {
697 		int idx;
698 
699 		idx = ffsl(~bmap[i]);
700 		if (idx == 0)
701 			continue;
702 
703 		--idx; /* ffsl is 1-based */
704 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
705 		    ("invalid i %d and idx %d", i, idx));
706 
707 		if (atomic_testandset_long(&bmap[i], idx))
708 			continue;
709 
710 		ret = i * LONG_BIT + idx;
711 		break;
712 	}
713 	return (ret);
714 }
715 
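/* Release a chimney sending buffer slot by clearing its bitmap bit. */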
716 static __inline void
717 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
718 {
719 	u_long mask;
720 	uint32_t idx;
721 
722 	idx = chim_idx / LONG_BIT;
723 	KASSERT(idx < sc->hn_chim_bmap_cnt,
724 	    ("invalid chimney index 0x%x", chim_idx));
725 
726 	mask = 1UL << (chim_idx % LONG_BIT);
727 	KASSERT(sc->hn_chim_bmap[idx] & mask,
728 	    ("index bitmap 0x%lx, chimney index %u, "
729 	     "bitmap idx %d, bitmask 0x%lx",
730 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
731 
732 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
733 }
734 
735 #if defined(INET6) || defined(INET)
736 
737 #define PULLUP_HDR(m, len)				\
738 do {							\
739 	if (__predict_false((m)->m_len < (len))) {	\
740 		(m) = m_pullup((m), (len));		\
741 		if ((m) == NULL)			\
742 			return (NULL);			\
743 	}						\
744 } while (0)
745 
746 /*
747  * NOTE: If this function fails, m_head will be freed.
748  */
749 static __inline struct mbuf *
750 hn_tso_fixup(struct mbuf *m_head)
751 {
752 	struct ether_vlan_header *evl;
753 	struct tcphdr *th;
754 	int ehlen;
755 
756 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
757 
758 	PULLUP_HDR(m_head, sizeof(*evl));
759 	evl = mtod(m_head, struct ether_vlan_header *);
760 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
761 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
762 	else
763 		ehlen = ETHER_HDR_LEN;
764 	m_head->m_pkthdr.l2hlen = ehlen;
765 
766 #ifdef INET
767 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
768 		struct ip *ip;
769 		int iphlen;
770 
771 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
772 		ip = mtodo(m_head, ehlen);
773 		iphlen = ip->ip_hl << 2;
774 		m_head->m_pkthdr.l3hlen = iphlen;
775 
776 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
777 		th = mtodo(m_head, ehlen + iphlen);
778 
779 		ip->ip_len = 0;
780 		ip->ip_sum = 0;
781 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
782 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
783 	}
784 #endif
785 #if defined(INET6) && defined(INET)
786 	else
787 #endif
788 #ifdef INET6
789 	{
790 		struct ip6_hdr *ip6;
791 
792 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
793 		ip6 = mtodo(m_head, ehlen);
794 		if (ip6->ip6_nxt != IPPROTO_TCP) {
795 			m_freem(m_head);
796 			return (NULL);
797 		}
798 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
799 
800 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
801 		th = mtodo(m_head, ehlen + sizeof(*ip6));
802 
803 		ip6->ip6_plen = 0;
804 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
805 	}
806 #endif
807 	return (m_head);
808 }
809 
810 /*
811  * NOTE: If this function fails, m_head will be freed.
812  */
813 static __inline struct mbuf *
814 hn_set_hlen(struct mbuf *m_head)
815 {
816 	const struct ether_vlan_header *evl;
817 	int ehlen;
818 
819 	PULLUP_HDR(m_head, sizeof(*evl));
820 	evl = mtod(m_head, const struct ether_vlan_header *);
821 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
822 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
823 	else
824 		ehlen = ETHER_HDR_LEN;
825 	m_head->m_pkthdr.l2hlen = ehlen;
826 
827 #ifdef INET
828 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
829 		const struct ip *ip;
830 		int iphlen;
831 
832 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
833 		ip = mtodo(m_head, ehlen);
834 		iphlen = ip->ip_hl << 2;
835 		m_head->m_pkthdr.l3hlen = iphlen;
836 
837 		/*
838 		 * UDP checksum offload does not work in Azure if the
839 		 * following conditions are met:
840 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
841 		 * - IP_DF is not set in the IP hdr.
842 		 *
843 		 * Fall back to software checksum for these UDP datagrams.
844 		 */
845 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
846 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
847 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
848 			uint16_t off = ehlen + iphlen;
849 
850 			counter_u64_add(hn_udpcs_fixup, 1);
851 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
852 			*(uint16_t *)(m_head->m_data + off +
853                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
854 			    m_head, m_head->m_pkthdr.len, off);
855 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
856 		}
857 	}
858 #endif
859 #if defined(INET6) && defined(INET)
860 	else
861 #endif
862 #ifdef INET6
863 	{
864 		const struct ip6_hdr *ip6;
865 
866 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
867 		ip6 = mtodo(m_head, ehlen);
868 		if (ip6->ip6_nxt != IPPROTO_TCP &&
869 		    ip6->ip6_nxt != IPPROTO_UDP) {
870 			m_freem(m_head);
871 			return (NULL);
872 		}
873 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
874 	}
875 #endif
876 	return (m_head);
877 }
878 
879 /*
880  * NOTE: If this function fails, m_head will be freed.
881  */
882 static __inline struct mbuf *
883 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
884 {
885 	const struct tcphdr *th;
886 	int ehlen, iphlen;
887 
888 	*tcpsyn = 0;
889 	ehlen = m_head->m_pkthdr.l2hlen;
890 	iphlen = m_head->m_pkthdr.l3hlen;
891 
892 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
893 	th = mtodo(m_head, ehlen + iphlen);
894 	if (th->th_flags & TH_SYN)
895 		*tcpsyn = 1;
896 	return (m_head);
897 }
898 
899 #undef PULLUP_HDR
900 
901 #endif	/* INET6 || INET */
902 
903 static int
904 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
905 {
906 	int error = 0;
907 
908 	HN_LOCK_ASSERT(sc);
909 
910 	if (sc->hn_rx_filter != filter) {
911 		error = hn_rndis_set_rxfilter(sc, filter);
912 		if (!error)
913 			sc->hn_rx_filter = filter;
914 	}
915 	return (error);
916 }
917 
918 static int
919 hn_rxfilter_config(struct hn_softc *sc)
920 {
921 	struct ifnet *ifp = sc->hn_ifp;
922 	uint32_t filter;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	/*
927 	 * If the non-transparent mode VF is activated, we don't know how
928 	 * its RX filter is configured, so stick the synthetic device in
929 	 * promiscuous mode.
930 	 */
931 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
932 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
933 	} else {
934 		filter = NDIS_PACKET_TYPE_DIRECTED;
935 		if (ifp->if_flags & IFF_BROADCAST)
936 			filter |= NDIS_PACKET_TYPE_BROADCAST;
937 		/* TODO: support multicast list */
938 		if ((ifp->if_flags & IFF_ALLMULTI) ||
939 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
940 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
941 	}
942 	return (hn_set_rxfilter(sc, filter));
943 }
944 
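/*
 * Compute the effective TX aggregation size/packet limits from the user
 * settings and the RNDIS-reported capabilities, then propagate them to
 * every TX ring.
 */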
945 static void
946 hn_set_txagg(struct hn_softc *sc)
947 {
948 	uint32_t size, pkts;
949 	int i;
950 
951 	/*
952 	 * Setup aggregation size.
953 	 */
954 	if (sc->hn_agg_size < 0)
955 		size = UINT32_MAX;
956 	else
957 		size = sc->hn_agg_size;
958 
959 	if (sc->hn_rndis_agg_size < size)
960 		size = sc->hn_rndis_agg_size;
961 
962 	/* NOTE: We only aggregate packets using chimney sending buffers. */
963 	if (size > (uint32_t)sc->hn_chim_szmax)
964 		size = sc->hn_chim_szmax;
965 
966 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
967 		/* Disable */
968 		size = 0;
969 		pkts = 0;
970 		goto done;
971 	}
972 
973 	/* NOTE: Type of the per TX ring setting is 'int'. */
974 	if (size > INT_MAX)
975 		size = INT_MAX;
976 
977 	/*
978 	 * Setup aggregation packet count.
979 	 */
980 	if (sc->hn_agg_pkts < 0)
981 		pkts = UINT32_MAX;
982 	else
983 		pkts = sc->hn_agg_pkts;
984 
985 	if (sc->hn_rndis_agg_pkts < pkts)
986 		pkts = sc->hn_rndis_agg_pkts;
987 
988 	if (pkts <= 1) {
989 		/* Disable */
990 		size = 0;
991 		pkts = 0;
992 		goto done;
993 	}
994 
995 	/* NOTE: Type of the per TX ring setting is 'short'. */
996 	if (pkts > SHRT_MAX)
997 		pkts = SHRT_MAX;
998 
999 done:
1000 	/* NOTE: Type of the per TX ring setting is 'short'. */
1001 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1002 		/* Disable */
1003 		size = 0;
1004 		pkts = 0;
1005 	}
1006 
1007 	if (bootverbose) {
1008 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1009 		    size, pkts, sc->hn_rndis_agg_align);
1010 	}
1011 
1012 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1013 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1014 
1015 		mtx_lock(&txr->hn_tx_lock);
1016 		txr->hn_agg_szmax = size;
1017 		txr->hn_agg_pktmax = pkts;
1018 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1019 		mtx_unlock(&txr->hn_tx_lock);
1020 	}
1021 }
1022 
1023 static int
1024 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1025 {
1026 
1027 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1028 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1029 		return txr->hn_txdesc_cnt;
1030 	return hn_tx_swq_depth;
1031 }
1032 
1033 static int
1034 hn_rss_reconfig(struct hn_softc *sc)
1035 {
1036 	int error;
1037 
1038 	HN_LOCK_ASSERT(sc);
1039 
1040 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1041 		return (ENXIO);
1042 
1043 	/*
1044 	 * Disable RSS first.
1045 	 *
1046 	 * NOTE:
1047 	 * Direct reconfiguration by setting the UNCHG flags does
1048 	 * _not_ work properly.
1049 	 */
1050 	if (bootverbose)
1051 		if_printf(sc->hn_ifp, "disable RSS\n");
1052 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1053 	if (error) {
1054 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1055 		return (error);
1056 	}
1057 
1058 	/*
1059 	 * Reenable the RSS w/ the updated RSS key or indirect
1060 	 * table.
1061 	 */
1062 	if (bootverbose)
1063 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1064 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1065 	if (error) {
1066 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1067 		return (error);
1068 	}
1069 	return (0);
1070 }
1071 
1072 static void
1073 hn_rss_ind_fixup(struct hn_softc *sc)
1074 {
1075 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1076 	int i, nchan;
1077 
1078 	nchan = sc->hn_rx_ring_inuse;
1079 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1080 
1081 	/*
1082 	 * Check indirect table to make sure that all channels in it
1083 	 * can be used.
1084 	 */
1085 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1086 		if (rss->rss_ind[i] >= nchan) {
1087 			if_printf(sc->hn_ifp,
1088 			    "RSS indirect table %d fixup: %u -> %d\n",
1089 			    i, rss->rss_ind[i], nchan - 1);
1090 			rss->rss_ind[i] = nchan - 1;
1091 		}
1092 	}
1093 }
1094 
1095 static int
1096 hn_ifmedia_upd(struct ifnet *ifp __unused)
1097 {
1098 
1099 	return EOPNOTSUPP;
1100 }
1101 
1102 static void
1103 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1104 {
1105 	struct hn_softc *sc = ifp->if_softc;
1106 
1107 	ifmr->ifm_status = IFM_AVALID;
1108 	ifmr->ifm_active = IFM_ETHER;
1109 
1110 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1111 		ifmr->ifm_active |= IFM_NONE;
1112 		return;
1113 	}
1114 	ifmr->ifm_status |= IFM_ACTIVE;
1115 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1116 }
1117 
1118 static void
1119 hn_rxvf_set_task(void *xarg, int pending __unused)
1120 {
1121 	struct hn_rxvf_setarg *arg = xarg;
1122 
1123 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1124 }
1125 
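/*
 * Point every RX ring at the given VF ifnet (or NULL).  For rings that
 * are currently in use, the update runs as a task on the ring's channel
 * to avoid racing the RX path.
 */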
1126 static void
1127 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1128 {
1129 	struct hn_rx_ring *rxr;
1130 	struct hn_rxvf_setarg arg;
1131 	struct task task;
1132 	int i;
1133 
1134 	HN_LOCK_ASSERT(sc);
1135 
1136 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1137 
1138 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1139 		rxr = &sc->hn_rx_ring[i];
1140 
1141 		if (i < sc->hn_rx_ring_inuse) {
1142 			arg.rxr = rxr;
1143 			arg.vf_ifp = vf_ifp;
1144 			vmbus_chan_run_task(rxr->hn_chan, &task);
1145 		} else {
1146 			rxr->hn_rxvf_ifp = vf_ifp;
1147 		}
1148 	}
1149 }
1150 
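/*
 * Return true if the given ifnet is the VF interface paired with this
 * synthetic device, i.e. an Ethernet interface (other than hn itself,
 * lagg or vlan) sharing our link-level address.
 */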
1151 static bool
1152 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1153 {
1154 	const struct ifnet *hn_ifp;
1155 
1156 	hn_ifp = sc->hn_ifp;
1157 
1158 	if (ifp == hn_ifp)
1159 		return (false);
1160 
1161 	if (ifp->if_alloctype != IFT_ETHER)
1162 		return (false);
1163 
1164 	/* Ignore lagg/vlan interfaces */
1165 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1166 	    strcmp(ifp->if_dname, "vlan") == 0)
1167 		return (false);
1168 
1169 	/*
1170 	 * During detach events ifp->if_addr might be NULL.
1171 	 * Make sure the bcmp() below doesn't panic on that:
1172 	 */
1173 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1174 		return (false);
1175 
1176 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1177 		return (false);
1178 
1179 	return (true);
1180 }
1181 
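/*
 * Switch the data path between the synthetic device and the
 * non-transparent mode VF: adjust the RX filter and RSS settings, tell
 * the host which data path to use, and suspend or resume the management
 * tasks accordingly.
 */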
1182 static void
1183 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1184 {
1185 	struct ifnet *hn_ifp;
1186 
1187 	HN_LOCK(sc);
1188 
1189 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1190 		goto out;
1191 
1192 	if (!hn_ismyvf(sc, ifp))
1193 		goto out;
1194 	hn_ifp = sc->hn_ifp;
1195 
1196 	if (rxvf) {
1197 		if (sc->hn_flags & HN_FLAG_RXVF)
1198 			goto out;
1199 
1200 		sc->hn_flags |= HN_FLAG_RXVF;
1201 		hn_rxfilter_config(sc);
1202 	} else {
1203 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1204 			goto out;
1205 
1206 		sc->hn_flags &= ~HN_FLAG_RXVF;
1207 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1208 			hn_rxfilter_config(sc);
1209 		else
1210 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1211 	}
1212 
1213 	hn_nvs_set_datapath(sc,
1214 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1215 
1216 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1217 
1218 	if (rxvf) {
1219 		hn_vf_rss_fixup(sc, true);
1220 		hn_suspend_mgmt(sc);
1221 		sc->hn_link_flags &=
1222 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1223 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1224 	} else {
1225 		hn_vf_rss_restore(sc);
1226 		hn_resume_mgmt(sc);
1227 	}
1228 
1229 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1230 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1231 
1232 	if (bootverbose) {
1233 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1234 		    rxvf ? "to" : "from", ifp->if_xname);
1235 	}
1236 out:
1237 	HN_UNLOCK(sc);
1238 }
1239 
1240 static void
1241 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1242 {
1243 
1244 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1245 		return;
1246 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1247 }
1248 
1249 static void
1250 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1251 {
1252 
1253 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1254 }
1255 
1256 static int
1257 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1258 {
1259 	struct ifnet *ifp, *vf_ifp;
1260 	uint64_t tmp;
1261 	int error;
1262 
1263 	HN_LOCK_ASSERT(sc);
1264 	ifp = sc->hn_ifp;
1265 	vf_ifp = sc->hn_vf_ifp;
1266 
1267 	/*
1268 	 * Fix up requested capabilities w/ supported capabilities,
1269 	 * since the supported capabilities could have been changed.
1270 	 */
1271 	ifr->ifr_reqcap &= ifp->if_capabilities;
1272 	/* Pass SIOCSIFCAP to VF. */
1273 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1274 
1275 	/*
1276 	 * NOTE:
1277 	 * The error will be propagated to the callers; however, it
1278 	 * is _not_ useful here.
1279 	 */
1280 
1281 	/*
1282 	 * Merge VF's enabled capabilities.
1283 	 */
1284 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1285 
1286 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1287 	if (ifp->if_capenable & IFCAP_TXCSUM)
1288 		ifp->if_hwassist |= tmp;
1289 	else
1290 		ifp->if_hwassist &= ~tmp;
1291 
1292 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1293 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1294 		ifp->if_hwassist |= tmp;
1295 	else
1296 		ifp->if_hwassist &= ~tmp;
1297 
1298 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1299 	if (ifp->if_capenable & IFCAP_TSO4)
1300 		ifp->if_hwassist |= tmp;
1301 	else
1302 		ifp->if_hwassist &= ~tmp;
1303 
1304 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1305 	if (ifp->if_capenable & IFCAP_TSO6)
1306 		ifp->if_hwassist |= tmp;
1307 	else
1308 		ifp->if_hwassist &= ~tmp;
1309 
1310 	return (error);
1311 }
1312 
1313 static int
1314 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1315 {
1316 	struct ifnet *vf_ifp;
1317 	struct ifreq ifr;
1318 
1319 	HN_LOCK_ASSERT(sc);
1320 	vf_ifp = sc->hn_vf_ifp;
1321 
1322 	memset(&ifr, 0, sizeof(ifr));
1323 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1324 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1325 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1326 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1327 }
1328 
1329 static void
1330 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1331 {
1332 	struct ifnet *ifp = sc->hn_ifp;
1333 	int allmulti = 0;
1334 
1335 	HN_LOCK_ASSERT(sc);
1336 
1337 	/* XXX vlan(4) style mcast addr maintenance */
1338 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1339 		allmulti = IFF_ALLMULTI;
1340 
1341 	/* Always set the VF's if_flags */
1342 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1343 }
1344 
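/*
 * if_input handler used for the transparent mode VF: run BPF taps and
 * input statistics on the VF, then hand the mbuf chain to hn(4)'s
 * if_input so the packets appear on the synthetic interface.
 */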
1345 static void
1346 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1347 {
1348 	struct rm_priotracker pt;
1349 	struct ifnet *hn_ifp = NULL;
1350 	struct mbuf *mn;
1351 
1352 	/*
1353 	 * XXX racy, if hn(4) is ever detached.
1354 	 */
1355 	rm_rlock(&hn_vfmap_lock, &pt);
1356 	if (vf_ifp->if_index < hn_vfmap_size)
1357 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1358 	rm_runlock(&hn_vfmap_lock, &pt);
1359 
1360 	if (hn_ifp != NULL) {
1361 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1362 			/*
1363 			 * Allow tapping on the VF.
1364 			 */
1365 			ETHER_BPF_MTAP(vf_ifp, mn);
1366 
1367 			/*
1368 			 * Update VF stats.
1369 			 */
1370 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1371 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1372 				    mn->m_pkthdr.len);
1373 			}
1374 			/*
1375 			 * XXX IFCOUNTER_IMCAST
1376 			 * This stat updating is kinda invasive, since it
1377 			 * requires two checks on the mbuf: the length check
1378 			 * and the ethernet header check.  As of this writing,
1379 			 * all multicast packets go directly to hn(4), which
1380 			 * makes imcast stat updating in the VF an effort in vain.
1381 			 */
1382 
1383 			/*
1384 			 * Fix up rcvif and increase hn(4)'s ipackets.
1385 			 */
1386 			mn->m_pkthdr.rcvif = hn_ifp;
1387 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1388 		}
1389 		/*
1390 		 * Go through hn(4)'s if_input.
1391 		 */
1392 		hn_ifp->if_input(hn_ifp, m);
1393 	} else {
1394 		/*
1395 		 * In the middle of the transition; free this
1396 		 * mbuf chain.
1397 		 */
1398 		while (m != NULL) {
1399 			mn = m->m_nextpkt;
1400 			m->m_nextpkt = NULL;
1401 			m_freem(m);
1402 			m = mn;
1403 		}
1404 	}
1405 }
1406 
1407 static void
1408 hn_mtu_change_fixup(struct hn_softc *sc)
1409 {
1410 	struct ifnet *ifp;
1411 
1412 	HN_LOCK_ASSERT(sc);
1413 	ifp = sc->hn_ifp;
1414 
1415 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1416 #if __FreeBSD_version >= 1100099
1417 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1418 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1419 #endif
1420 }
1421 
1422 static uint32_t
1423 hn_rss_type_fromndis(uint32_t rss_hash)
1424 {
1425 	uint32_t types = 0;
1426 
1427 	if (rss_hash & NDIS_HASH_IPV4)
1428 		types |= RSS_TYPE_IPV4;
1429 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1430 		types |= RSS_TYPE_TCP_IPV4;
1431 	if (rss_hash & NDIS_HASH_IPV6)
1432 		types |= RSS_TYPE_IPV6;
1433 	if (rss_hash & NDIS_HASH_IPV6_EX)
1434 		types |= RSS_TYPE_IPV6_EX;
1435 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1436 		types |= RSS_TYPE_TCP_IPV6;
1437 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1438 		types |= RSS_TYPE_TCP_IPV6_EX;
1439 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1440 		types |= RSS_TYPE_UDP_IPV4;
1441 	return (types);
1442 }
1443 
1444 static uint32_t
1445 hn_rss_type_tondis(uint32_t types)
1446 {
1447 	uint32_t rss_hash = 0;
1448 
1449 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1450 	    ("UDP6 and UDP6EX are not supported"));
1451 
1452 	if (types & RSS_TYPE_IPV4)
1453 		rss_hash |= NDIS_HASH_IPV4;
1454 	if (types & RSS_TYPE_TCP_IPV4)
1455 		rss_hash |= NDIS_HASH_TCP_IPV4;
1456 	if (types & RSS_TYPE_IPV6)
1457 		rss_hash |= NDIS_HASH_IPV6;
1458 	if (types & RSS_TYPE_IPV6_EX)
1459 		rss_hash |= NDIS_HASH_IPV6_EX;
1460 	if (types & RSS_TYPE_TCP_IPV6)
1461 		rss_hash |= NDIS_HASH_TCP_IPV6;
1462 	if (types & RSS_TYPE_TCP_IPV6_EX)
1463 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1464 	if (types & RSS_TYPE_UDP_IPV4)
1465 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1466 	return (rss_hash);
1467 }
1468 
1469 static void
1470 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1471 {
1472 	int i;
1473 
1474 	HN_LOCK_ASSERT(sc);
1475 
1476 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1477 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1478 }
1479 
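/*
 * Align the synthetic device's RSS configuration with the VF's: adopt
 * the VF's Toeplitz key, intersect the hash types, and stop delivering
 * mbuf hash values/types that the two sides would compute differently.
 */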
1480 static void
1481 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1482 {
1483 	struct ifnet *ifp, *vf_ifp;
1484 	struct ifrsshash ifrh;
1485 	struct ifrsskey ifrk;
1486 	int error;
1487 	uint32_t my_types, diff_types, mbuf_types = 0;
1488 
1489 	HN_LOCK_ASSERT(sc);
1490 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1491 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1492 
1493 	if (sc->hn_rx_ring_inuse == 1) {
1494 		/* No RSS on synthetic parts; done. */
1495 		return;
1496 	}
1497 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1498 		/* Synthetic parts do not support Toeplitz; done. */
1499 		return;
1500 	}
1501 
1502 	ifp = sc->hn_ifp;
1503 	vf_ifp = sc->hn_vf_ifp;
1504 
1505 	/*
1506 	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1507 	 * supported.
1508 	 */
1509 	memset(&ifrk, 0, sizeof(ifrk));
1510 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1511 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1512 	if (error) {
1513 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1514 		    vf_ifp->if_xname, error);
1515 		goto done;
1516 	}
1517 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1518 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1519 		    vf_ifp->if_xname, ifrk.ifrk_func);
1520 		goto done;
1521 	}
1522 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1523 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1524 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1525 		goto done;
1526 	}
1527 
1528 	/*
1529 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1530 	 */
1531 	memset(&ifrh, 0, sizeof(ifrh));
1532 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1533 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1534 	if (error) {
1535 		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1536 		    vf_ifp->if_xname, error);
1537 		goto done;
1538 	}
1539 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1540 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1541 		    vf_ifp->if_xname, ifrh.ifrh_func);
1542 		goto done;
1543 	}
1544 
1545 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1546 	if ((ifrh.ifrh_types & my_types) == 0) {
1547 		/* This disables RSS; ignore it then */
1548 		if_printf(ifp, "%s intersection of RSS types failed.  "
1549 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1550 		    ifrh.ifrh_types, my_types);
1551 		goto done;
1552 	}
1553 
1554 	diff_types = my_types ^ ifrh.ifrh_types;
1555 	my_types &= ifrh.ifrh_types;
1556 	mbuf_types = my_types;
1557 
1558 	/*
1559 	 * Detect RSS hash value/type conflicts.
1560 	 *
1561 	 * NOTE:
1562 	 * We don't disable the hash type, but stop delivering the hash
1563 	 * value/type through mbufs on the RX path.
1564 	 *
1565 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1566 	 * hash is delivered with type of TCP_IPV4.  This means if
1567 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1568 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1569 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1570 	 * here.
1571 	 */
1572 	if ((my_types & RSS_TYPE_IPV4) &&
1573 	    (diff_types & ifrh.ifrh_types &
1574 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1575 		/* Conflict; disable IPV4 hash type/value delivery. */
1576 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1577 		mbuf_types &= ~RSS_TYPE_IPV4;
1578 	}
1579 	if ((my_types & RSS_TYPE_IPV6) &&
1580 	    (diff_types & ifrh.ifrh_types &
1581 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1582 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1583 	      RSS_TYPE_IPV6_EX))) {
1584 		/* Conflict; disable IPV6 hash type/value delivery. */
1585 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1586 		mbuf_types &= ~RSS_TYPE_IPV6;
1587 	}
1588 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1589 	    (diff_types & ifrh.ifrh_types &
1590 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1591 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1592 	      RSS_TYPE_IPV6))) {
1593 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1594 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1595 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1596 	}
1597 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1598 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1599 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1600 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1601 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1602 	}
1603 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1604 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1605 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1606 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1607 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1608 	}
1609 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1610 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1611 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1612 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1613 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1614 	}
1615 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1616 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1617 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1618 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1619 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1620 	}
1621 
1622 	/*
1623 	 * Indirect table does not matter.
1624 	 */
1625 
1626 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1627 	    hn_rss_type_tondis(my_types);
1628 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1629 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1630 
1631 	if (reconf) {
1632 		error = hn_rss_reconfig(sc);
1633 		if (error) {
1634 			/* XXX roll-back? */
1635 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1636 			/* XXX keep going. */
1637 		}
1638 	}
1639 done:
1640 	/* Hash deliverability for mbufs. */
1641 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1642 }
1643 
1644 static void
1645 hn_vf_rss_restore(struct hn_softc *sc)
1646 {
1647 
1648 	HN_LOCK_ASSERT(sc);
1649 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1650 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1651 
1652 	if (sc->hn_rx_ring_inuse == 1)
1653 		goto done;
1654 
1655 	/*
1656 	 * Restore hash types.  Key does _not_ matter.
1657 	 */
1658 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1659 		int error;
1660 
1661 		sc->hn_rss_hash = sc->hn_rss_hcap;
1662 		error = hn_rss_reconfig(sc);
1663 		if (error) {
1664 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1665 			    error);
1666 			/* XXX keep going. */
1667 		}
1668 	}
1669 done:
1670 	/* Hash deliverability for mbufs. */
1671 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1672 }
1673 
1674 static void
1675 hn_xpnt_vf_setready(struct hn_softc *sc)
1676 {
1677 	struct ifnet *ifp, *vf_ifp;
1678 	struct ifreq ifr;
1679 
1680 	HN_LOCK_ASSERT(sc);
1681 	ifp = sc->hn_ifp;
1682 	vf_ifp = sc->hn_vf_ifp;
1683 
1684 	/*
1685 	 * Mark the VF ready.
1686 	 */
1687 	sc->hn_vf_rdytick = 0;
1688 
1689 	/*
1690 	 * Save information for restoration.
1691 	 */
1692 	sc->hn_saved_caps = ifp->if_capabilities;
1693 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1694 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1695 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1696 
1697 	/*
1698 	 * Intersect supported/enabled capabilities.
1699 	 *
1700 	 * NOTE:
1701 	 * if_hwassist is not changed here.
1702 	 */
1703 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1704 	ifp->if_capenable &= ifp->if_capabilities;
1705 
1706 	/*
1707 	 * Fix TSO settings.
1708 	 */
1709 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1710 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1711 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1712 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1713 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1714 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1715 
1716 	/*
1717 	 * Change VF's enabled capabilities.
1718 	 */
1719 	memset(&ifr, 0, sizeof(ifr));
1720 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1721 	ifr.ifr_reqcap = ifp->if_capenable;
1722 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1723 
1724 	if (ifp->if_mtu != ETHERMTU) {
1725 		int error;
1726 
1727 		/*
1728 		 * Change VF's MTU.
1729 		 */
1730 		memset(&ifr, 0, sizeof(ifr));
1731 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1732 		ifr.ifr_mtu = ifp->if_mtu;
1733 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1734 		if (error) {
1735 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1736 			    vf_ifp->if_xname, ifp->if_mtu);
1737 			if (ifp->if_mtu > ETHERMTU) {
1738 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1739 
1740 				/*
1741 				 * XXX
1742 				 * No need to adjust the synthetic parts' MTU;
1743 				 * failure of the adjustment would cause us
1744 				 * endless headaches.
1745 				 */
1746 				ifp->if_mtu = ETHERMTU;
1747 				hn_mtu_change_fixup(sc);
1748 			}
1749 		}
1750 	}
1751 }
1752 
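/*
 * Return true if the transparent VF can be used for the data path.
 * The VF becomes ready either once hn_xpnt_vf_setready() has already
 * run (hn_vf_rdytick == 0) or once the attach wait period has expired,
 * in which case it is marked ready here.
 */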
1753 static bool
1754 hn_xpnt_vf_isready(struct hn_softc *sc)
1755 {
1756 
1757 	HN_LOCK_ASSERT(sc);
1758 
1759 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1760 		return (false);
1761 
1762 	if (sc->hn_vf_rdytick == 0)
1763 		return (true);
1764 
1765 	if (sc->hn_vf_rdytick > ticks)
1766 		return (false);
1767 
1768 	/* Mark VF as ready. */
1769 	hn_xpnt_vf_setready(sc);
1770 	return (true);
1771 }
1772 
1773 static void
1774 hn_xpnt_vf_setenable(struct hn_softc *sc)
1775 {
1776 	int i;
1777 
1778 	HN_LOCK_ASSERT(sc);
1779 
1780 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1781 	rm_wlock(&sc->hn_vf_lock);
1782 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1783 	rm_wunlock(&sc->hn_vf_lock);
1784 
1785 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1786 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1787 }
1788 
1789 static void
1790 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1791 {
1792 	int i;
1793 
1794 	HN_LOCK_ASSERT(sc);
1795 
1796 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1797 	rm_wlock(&sc->hn_vf_lock);
1798 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1799 	if (clear_vf)
1800 		sc->hn_vf_ifp = NULL;
1801 	rm_wunlock(&sc->hn_vf_lock);
1802 
1803 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1804 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1805 }
1806 
1807 static void
1808 hn_xpnt_vf_init(struct hn_softc *sc)
1809 {
1810 	int error;
1811 
1812 	HN_LOCK_ASSERT(sc);
1813 
1814 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1815 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1816 
1817 	if (bootverbose) {
1818 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1819 		    sc->hn_vf_ifp->if_xname);
1820 	}
1821 
1822 	/*
1823 	 * Bring the VF up.
1824 	 */
1825 	hn_xpnt_vf_saveifflags(sc);
1826 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1827 	error = hn_xpnt_vf_iocsetflags(sc);
1828 	if (error) {
1829 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1830 		    sc->hn_vf_ifp->if_xname, error);
1831 		return;
1832 	}
1833 
1834 	/*
1835 	 * NOTE:
1836 	 * Datapath setting must happen _after_ bringing the VF up.
1837 	 */
1838 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1839 
1840 	/*
1841 	 * NOTE:
1842 	 * Fixup RSS related bits _after_ the VF is brought up, since
1843 	 * Fixup RSS related bits _after_ the VF is brought up, since
1844 	 * many VFs generate their RSS key during initialization.
1845 	hn_vf_rss_fixup(sc, true);
1846 
1847 	/* Mark transparent mode VF as enabled. */
1848 	hn_xpnt_vf_setenable(sc);
1849 }
1850 
1851 static void
1852 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1853 {
1854 	struct hn_softc *sc = xsc;
1855 
1856 	HN_LOCK(sc);
1857 
1858 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1859 		goto done;
1860 	if (sc->hn_vf_ifp == NULL)
1861 		goto done;
1862 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1863 		goto done;
1864 
1865 	if (sc->hn_vf_rdytick != 0) {
1866 		/* Mark VF as ready. */
1867 		hn_xpnt_vf_setready(sc);
1868 	}
1869 
1870 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1871 		/*
1872 		 * Delayed VF initialization.
1873 		 */
1874 		if (bootverbose) {
1875 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1876 			    sc->hn_vf_ifp->if_xname);
1877 		}
1878 		hn_xpnt_vf_init(sc);
1879 	}
1880 done:
1881 	HN_UNLOCK(sc);
1882 }
1883 
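/*
 * ether_ifattach event handler: if the newly attached interface is this
 * device's VF, record it in the global VF map and, in transparent VF
 * mode, install hn_xpnt_vf_input on it and schedule the delayed VF
 * bring-up.
 */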
1884 static void
1885 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1886 {
1887 	struct hn_softc *sc = xsc;
1888 
1889 	HN_LOCK(sc);
1890 
1891 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1892 		goto done;
1893 
1894 	if (!hn_ismyvf(sc, ifp))
1895 		goto done;
1896 
1897 	if (sc->hn_vf_ifp != NULL) {
1898 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1899 		    sc->hn_vf_ifp->if_xname);
1900 		goto done;
1901 	}
1902 
1903 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1904 		/*
1905 		 * ifnet.if_start is _not_ supported by transparent
1906 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1907 		 */
1908 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1909 		    "in transparent VF mode.\n", ifp->if_xname);
1910 		goto done;
1911 	}
1912 
1913 	rm_wlock(&hn_vfmap_lock);
1914 
1915 	if (ifp->if_index >= hn_vfmap_size) {
1916 		struct ifnet **newmap;
1917 		int newsize;
1918 
1919 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1920 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1921 		    M_WAITOK | M_ZERO);
1922 
1923 		memcpy(newmap, hn_vfmap,
1924 		    sizeof(struct ifnet *) * hn_vfmap_size);
1925 		free(hn_vfmap, M_DEVBUF);
1926 		hn_vfmap = newmap;
1927 		hn_vfmap_size = newsize;
1928 	}
1929 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1930 	    ("%s: ifindex %d was mapped to %s",
1931 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1932 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1933 
1934 	rm_wunlock(&hn_vfmap_lock);
1935 
1936 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1937 	rm_wlock(&sc->hn_vf_lock);
1938 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1939 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1940 	sc->hn_vf_ifp = ifp;
1941 	rm_wunlock(&sc->hn_vf_lock);
1942 
1943 	if (hn_xpnt_vf) {
1944 		int wait_ticks;
1945 
1946 		/*
1947 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1948 		 * Save vf_ifp's current if_input for later restoration.
1949 		 */
1950 		sc->hn_vf_input = ifp->if_input;
1951 		ifp->if_input = hn_xpnt_vf_input;
1952 
1953 		/*
1954 		 * Stop link status management; use the VF's.
1955 		 */
1956 		hn_suspend_mgmt(sc);
1957 
1958 		/*
1959 		 * Give the VF some time to complete its attach routine.
1960 		 */
1961 		wait_ticks = hn_xpnt_vf_attwait * hz;
1962 		sc->hn_vf_rdytick = ticks + wait_ticks;
1963 
1964 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1965 		    wait_ticks);
1966 	}
1967 done:
1968 	HN_UNLOCK(sc);
1969 }
1970 
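/*
 * ifnet departure event handler: undo what hn_ifnet_attevent() did for
 * the VF -- restore its if_input, switch the datapath back to the
 * synthetic parts, restore the saved capabilities/TSO limits and RSS
 * settings, and remove the interface from the global VF map.
 */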
1971 static void
1972 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1973 {
1974 	struct hn_softc *sc = xsc;
1975 
1976 	HN_LOCK(sc);
1977 
1978 	if (sc->hn_vf_ifp == NULL)
1979 		goto done;
1980 
1981 	if (!hn_ismyvf(sc, ifp))
1982 		goto done;
1983 
1984 	if (hn_xpnt_vf) {
1985 		/*
1986 		 * Make sure that the delayed initialization is not running.
1987 		 *
1988 		 * NOTE:
1989 		 * - This lock _must_ be released, since the hn_vf_init task
1990 		 *   will try to hold this lock.
1991 		 * - It is safe to release this lock here, since the
1992 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1993 		 *
1994 		 * XXX racy, if hn(4) ever detached.
1995 		 */
1996 		HN_UNLOCK(sc);
1997 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1998 		HN_LOCK(sc);
1999 
2000 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2001 		    sc->hn_ifp->if_xname));
2002 		ifp->if_input = sc->hn_vf_input;
2003 		sc->hn_vf_input = NULL;
2004 
2005 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2006 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2007 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2008 
2009 		if (sc->hn_vf_rdytick == 0) {
2010 			/*
2011 			 * The VF was ready; restore some settings.
2012 			 */
2013 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2014 			/*
2015 			 * NOTE:
2016 			 * There is _no_ need to fixup if_capenable and
2017 			 * if_hwassist, since the if_capabilities before
2018 			 * restoration was an intersection of the VF's
2019 			 * if_capabilities and the synthetic device's
2020 			 * if_capabilities.
2021 			 */
2022 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2023 			sc->hn_ifp->if_hw_tsomaxsegcount =
2024 			    sc->hn_saved_tsosegcnt;
2025 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2026 		}
2027 
2028 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2029 			/*
2030 			 * Restore RSS settings.
2031 			 */
2032 			hn_vf_rss_restore(sc);
2033 
2034 			/*
2035 			 * Resume link status management, which was suspended
2036 			 * by hn_ifnet_attevent().
2037 			 */
2038 			hn_resume_mgmt(sc);
2039 		}
2040 	}
2041 
2042 	/* Mark transparent mode VF as disabled. */
2043 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2044 
2045 	rm_wlock(&hn_vfmap_lock);
2046 
2047 	KASSERT(ifp->if_index < hn_vfmap_size,
2048 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2049 	if (hn_vfmap[ifp->if_index] != NULL) {
2050 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2051 		    ("%s: ifindex %d was mapped to %s",
2052 		     ifp->if_xname, ifp->if_index,
2053 		     hn_vfmap[ifp->if_index]->if_xname));
2054 		hn_vfmap[ifp->if_index] = NULL;
2055 	}
2056 
2057 	rm_wunlock(&hn_vfmap_lock);
2058 done:
2059 	HN_UNLOCK(sc);
2060 }
2061 
2062 static void
2063 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2064 {
2065 	struct hn_softc *sc = xsc;
2066 
2067 	if (sc->hn_vf_ifp == ifp)
2068 		if_link_state_change(sc->hn_ifp, link_state);
2069 }
2070 
2071 static int
2072 hn_probe(device_t dev)
2073 {
2074 
2075 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2076 		device_set_desc(dev, "Hyper-V Network Interface");
2077 		return BUS_PROBE_DEFAULT;
2078 	}
2079 	return ENXIO;
2080 }
2081 
2082 static int
2083 hn_attach(device_t dev)
2084 {
2085 	struct hn_softc *sc = device_get_softc(dev);
2086 	struct sysctl_oid_list *child;
2087 	struct sysctl_ctx_list *ctx;
2088 	uint8_t eaddr[ETHER_ADDR_LEN];
2089 	struct ifnet *ifp = NULL;
2090 	int error, ring_cnt, tx_ring_cnt;
2091 	uint32_t mtu;
2092 
2093 	sc->hn_dev = dev;
2094 	sc->hn_prichan = vmbus_get_channel(dev);
2095 	HN_LOCK_INIT(sc);
2096 	rm_init(&sc->hn_vf_lock, "hnvf");
2097 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2098 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2099 
2100 	/*
2101 	 * Initialize these tunables once.
2102 	 */
2103 	sc->hn_agg_size = hn_tx_agg_size;
2104 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2105 
2106 	/*
2107 	 * Setup taskqueue for transmission.
2108 	 */
2109 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2110 		int i;
2111 
2112 		sc->hn_tx_taskqs =
2113 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2114 		    M_DEVBUF, M_WAITOK);
2115 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2116 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2117 			    M_WAITOK, taskqueue_thread_enqueue,
2118 			    &sc->hn_tx_taskqs[i]);
2119 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2120 			    "%s tx%d", device_get_nameunit(dev), i);
2121 		}
2122 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2123 		sc->hn_tx_taskqs = hn_tx_taskque;
2124 	}
2125 
2126 	/*
2127 	 * Setup taskqueue for management tasks, e.g. link status.
2128 	 */
2129 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2130 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2131 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2132 	    device_get_nameunit(dev));
2133 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2134 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2135 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2136 	    hn_netchg_status_taskfunc, sc);
2137 
2138 	if (hn_xpnt_vf) {
2139 		/*
2140 		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2141 		 */
2142 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2143 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2144 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2145 		    device_get_nameunit(dev));
2146 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2147 		    hn_xpnt_vf_init_taskfunc, sc);
2148 	}
2149 
2150 	/*
2151 	 * Allocate the ifnet and set up its name early, so that if_printf
2152 	 * can be used by functions that will be called after
2153 	 * ether_ifattach().
2154 	 */
2155 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2156 	ifp->if_softc = sc;
2157 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2158 
2159 	/*
2160 	 * Initialize ifmedia early so that it can be unconditionally
2161 	 * destroyed, if an error happens later on.
2162 	 */
2163 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2164 
2165 	/*
2166 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2167 	 * to use (tx_ring_cnt).
2168 	 *
2169 	 * NOTE:
2170 	 * The # of RX rings to use is the same as the # of channels to use.
2171 	 */
2172 	ring_cnt = hn_chan_cnt;
2173 	if (ring_cnt <= 0) {
2174 		/* Default */
2175 		ring_cnt = mp_ncpus;
2176 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2177 			ring_cnt = HN_RING_CNT_DEF_MAX;
2178 	} else if (ring_cnt > mp_ncpus) {
2179 		ring_cnt = mp_ncpus;
2180 	}
2181 #ifdef RSS
2182 	if (ring_cnt > rss_getnumbuckets())
2183 		ring_cnt = rss_getnumbuckets();
2184 #endif
2185 
2186 	tx_ring_cnt = hn_tx_ring_cnt;
2187 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2188 		tx_ring_cnt = ring_cnt;
2189 #ifdef HN_IFSTART_SUPPORT
2190 	if (hn_use_if_start) {
2191 		/* ifnet.if_start only needs one TX ring. */
2192 		tx_ring_cnt = 1;
2193 	}
2194 #endif
2195 
2196 	/*
2197 	 * Set the leader CPU for channels.
2198 	 */
2199 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2200 
2201 	/*
2202 	 * Create enough TX/RX rings, even if only a limited number of
2203 	 * channels can be allocated.
2204 	 */
2205 	error = hn_create_tx_data(sc, tx_ring_cnt);
2206 	if (error)
2207 		goto failed;
2208 	error = hn_create_rx_data(sc, ring_cnt);
2209 	if (error)
2210 		goto failed;
2211 
2212 	/*
2213 	 * Create transaction context for NVS and RNDIS transactions.
2214 	 */
2215 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2216 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2217 	if (sc->hn_xact == NULL) {
2218 		error = ENXIO;
2219 		goto failed;
2220 	}
2221 
2222 	/*
2223 	 * Install orphan handler for the revocation of this device's
2224 	 * primary channel.
2225 	 *
2226 	 * NOTE:
2227 	 * The processing order is critical here:
2228 	 * Install the orphan handler, _before_ testing whether this
2229 	 * device's primary channel has been revoked or not.
2230 	 */
2231 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2232 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2233 		error = ENXIO;
2234 		goto failed;
2235 	}
2236 
2237 	/*
2238 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2239 	 */
2240 	error = hn_synth_attach(sc, ETHERMTU);
2241 	if (error)
2242 		goto failed;
2243 
2244 	error = hn_rndis_get_eaddr(sc, eaddr);
2245 	if (error)
2246 		goto failed;
2247 
2248 	error = hn_rndis_get_mtu(sc, &mtu);
2249 	if (error)
2250 		mtu = ETHERMTU;
2251 	else if (bootverbose)
2252 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2253 
2254 #if __FreeBSD_version >= 1100099
2255 	if (sc->hn_rx_ring_inuse > 1) {
2256 		/*
2257 		 * Reduce TCP segment aggregation limit for multiple
2258 		 * RX rings to increase ACK timeliness.
2259 		 */
2260 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2261 	}
2262 #endif
2263 
2264 	/*
2265 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2266 	 */
2267 	hn_fixup_tx_data(sc);
2268 	hn_fixup_rx_data(sc);
2269 
2270 	ctx = device_get_sysctl_ctx(dev);
2271 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2272 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2273 	    &sc->hn_nvs_ver, 0, "NVS version");
2274 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2275 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2276 	    hn_ndis_version_sysctl, "A", "NDIS version");
2277 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2278 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2279 	    hn_caps_sysctl, "A", "capabilities");
2280 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2281 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2282 	    hn_hwassist_sysctl, "A", "hwassist");
2283 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2284 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2285 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2286 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2287 	    "max # of TSO segments");
2288 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2289 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2290 	    "max size of TSO segment");
2291 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2292 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2293 	    hn_rxfilter_sysctl, "A", "rxfilter");
2294 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2295 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2296 	    hn_rss_hash_sysctl, "A", "RSS hash");
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2298 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2301 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2303 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2304 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2305 #ifndef RSS
2306 	/*
2307 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2308 	 */
2309 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2310 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2311 	    hn_rss_key_sysctl, "IU", "RSS key");
2312 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2313 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2314 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2315 #endif
2316 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2317 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2318 	    "RNDIS offered packet transmission aggregation size limit");
2319 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2320 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2321 	    "RNDIS offered packet transmission aggregation count limit");
2322 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2323 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2324 	    "RNDIS packet transmission aggregation alignment");
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2326 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_txagg_size_sysctl, "I",
2328 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2329 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2330 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 	    hn_txagg_pkts_sysctl, "I",
2332 	    "Packet transmission aggregation packets, "
2333 	    "0 -- disable, -1 -- auto");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2335 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_polling_sysctl, "I",
2337 	    "Polling frequency: [100,1000000], 0 disable polling");
2338 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2339 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2340 	    hn_vf_sysctl, "A", "Virtual Function's name");
2341 	if (!hn_xpnt_vf) {
2342 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2343 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2344 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2345 	} else {
2346 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2347 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2348 		    hn_xpnt_vf_enabled_sysctl, "I",
2349 		    "Transparent VF enabled");
2350 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2351 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2352 		    hn_xpnt_vf_accbpf_sysctl, "I",
2353 		    "Accurate BPF for transparent VF");
2354 	}
2355 
2356 	/*
2357 	 * Setup the ifmedia, which has been initialized earlier.
2358 	 */
2359 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2360 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2361 	/* XXX ifmedia_set really should do this for us */
2362 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2363 
2364 	/*
2365 	 * Setup the ifnet for this interface.
2366 	 */
2367 
2368 	ifp->if_baudrate = IF_Gbps(10);
2369 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2370 	ifp->if_ioctl = hn_ioctl;
2371 	ifp->if_init = hn_init;
2372 #ifdef HN_IFSTART_SUPPORT
2373 	if (hn_use_if_start) {
2374 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2375 
2376 		ifp->if_start = hn_start;
2377 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2378 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2379 		IFQ_SET_READY(&ifp->if_snd);
2380 	} else
2381 #endif
2382 	{
2383 		ifp->if_transmit = hn_transmit;
2384 		ifp->if_qflush = hn_xmit_qflush;
2385 	}
2386 
2387 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2388 #ifdef foo
2389 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2390 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2391 #endif
2392 	if (sc->hn_caps & HN_CAP_VLAN) {
2393 		/* XXX not sure about VLAN_MTU. */
2394 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2395 	}
2396 
2397 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2398 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2399 		ifp->if_capabilities |= IFCAP_TXCSUM;
2400 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2401 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2402 	if (sc->hn_caps & HN_CAP_TSO4) {
2403 		ifp->if_capabilities |= IFCAP_TSO4;
2404 		ifp->if_hwassist |= CSUM_IP_TSO;
2405 	}
2406 	if (sc->hn_caps & HN_CAP_TSO6) {
2407 		ifp->if_capabilities |= IFCAP_TSO6;
2408 		ifp->if_hwassist |= CSUM_IP6_TSO;
2409 	}
2410 
2411 	/* Enable all available capabilities by default. */
2412 	ifp->if_capenable = ifp->if_capabilities;
2413 
2414 	/*
2415 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2416 	 * be enabled through SIOCSIFCAP.
2417 	 */
2418 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2419 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2420 
2421 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2422 		/*
2423 		 * Lock hn_set_tso_maxsize() to simplify its
2424 		 * internal logic.
2425 		 */
2426 		HN_LOCK(sc);
2427 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2428 		HN_UNLOCK(sc);
2429 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2430 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2431 	}
2432 
2433 	ether_ifattach(ifp, eaddr);
2434 
2435 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2436 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2437 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2438 	}
2439 	if (mtu < ETHERMTU) {
2440 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2441 		ifp->if_mtu = mtu;
2442 	}
2443 
2444 	/* Inform the upper layer about the long frame support. */
2445 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2446 
2447 	/*
2448 	 * Kick off link status check.
2449 	 */
2450 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2451 	hn_update_link_status(sc);
2452 
2453 	if (!hn_xpnt_vf) {
2454 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2455 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2456 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2457 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2458 	} else {
2459 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2460 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2461 	}
2462 
2463 	/*
2464 	 * NOTE:
2465 	 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2466 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2467 	 * available yet when the ifnet_arrival event is triggered.
2468 	 */
2469 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2470 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2471 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2472 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2473 
2474 	return (0);
2475 failed:
2476 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2477 		hn_synth_detach(sc);
2478 	hn_detach(dev);
2479 	return (error);
2480 }
2481 
2482 static int
2483 hn_detach(device_t dev)
2484 {
2485 	struct hn_softc *sc = device_get_softc(dev);
2486 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2487 
2488 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2489 		/*
2490 		 * In case the vmbus missed the orphan handler
2491 		 * installation.
2492 		 */
2493 		vmbus_xact_ctx_orphan(sc->hn_xact);
2494 	}
2495 
2496 	if (sc->hn_ifaddr_evthand != NULL)
2497 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2498 	if (sc->hn_ifnet_evthand != NULL)
2499 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2500 	if (sc->hn_ifnet_atthand != NULL) {
2501 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2502 		    sc->hn_ifnet_atthand);
2503 	}
2504 	if (sc->hn_ifnet_dethand != NULL) {
2505 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2506 		    sc->hn_ifnet_dethand);
2507 	}
2508 	if (sc->hn_ifnet_lnkhand != NULL)
2509 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2510 
2511 	vf_ifp = sc->hn_vf_ifp;
2512 	__compiler_membar();
2513 	if (vf_ifp != NULL)
2514 		hn_ifnet_detevent(sc, vf_ifp);
2515 
2516 	if (device_is_attached(dev)) {
2517 		HN_LOCK(sc);
2518 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2519 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2520 				hn_stop(sc, true);
2521 			/*
2522 			 * NOTE:
2523 			 * hn_stop() only suspends data, so the management
2524 			 * tasks have to be suspended manually here.
2525 			 */
2526 			hn_suspend_mgmt(sc);
2527 			hn_synth_detach(sc);
2528 		}
2529 		HN_UNLOCK(sc);
2530 		ether_ifdetach(ifp);
2531 	}
2532 
2533 	ifmedia_removeall(&sc->hn_media);
2534 	hn_destroy_rx_data(sc);
2535 	hn_destroy_tx_data(sc);
2536 
2537 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2538 		int i;
2539 
2540 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2541 			taskqueue_free(sc->hn_tx_taskqs[i]);
2542 		free(sc->hn_tx_taskqs, M_DEVBUF);
2543 	}
2544 	taskqueue_free(sc->hn_mgmt_taskq0);
2545 	if (sc->hn_vf_taskq != NULL)
2546 		taskqueue_free(sc->hn_vf_taskq);
2547 
2548 	if (sc->hn_xact != NULL) {
2549 		/*
2550 		 * Uninstall the orphan handler _before_ the xact is
2551 		 * destroyed.
2552 		 */
2553 		vmbus_chan_unset_orphan(sc->hn_prichan);
2554 		vmbus_xact_ctx_destroy(sc->hn_xact);
2555 	}
2556 
2557 	if_free(ifp);
2558 
2559 	HN_LOCK_DESTROY(sc);
2560 	rm_destroy(&sc->hn_vf_lock);
2561 	return (0);
2562 }
2563 
2564 static int
2565 hn_shutdown(device_t dev)
2566 {
2567 
2568 	return (0);
2569 }
2570 
2571 static void
2572 hn_link_status(struct hn_softc *sc)
2573 {
2574 	uint32_t link_status;
2575 	int error;
2576 
2577 	error = hn_rndis_get_linkstatus(sc, &link_status);
2578 	if (error) {
2579 		/* XXX what to do? */
2580 		return;
2581 	}
2582 
2583 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2584 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2585 	else
2586 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2587 	if_link_state_change(sc->hn_ifp,
2588 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2589 	    LINK_STATE_UP : LINK_STATE_DOWN);
2590 }
2591 
2592 static void
2593 hn_link_taskfunc(void *xsc, int pending __unused)
2594 {
2595 	struct hn_softc *sc = xsc;
2596 
2597 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2598 		return;
2599 	hn_link_status(sc);
2600 }
2601 
2602 static void
2603 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2604 {
2605 	struct hn_softc *sc = xsc;
2606 
2607 	/* Prevent any link status checks from running. */
2608 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2609 
2610 	/*
2611 	 * Fake up a [link down --> link up] state change; a 5 second
2612 	 * delay is used, which closely simulates the miibus reaction
2613 	 * to a link down event.
2614 	 */
2615 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2616 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2617 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2618 	    &sc->hn_netchg_status, 5 * hz);
2619 }
2620 
2621 static void
2622 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2623 {
2624 	struct hn_softc *sc = xsc;
2625 
2626 	/* Re-allow link status checks. */
2627 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2628 	hn_link_status(sc);
2629 }
2630 
2631 static void
2632 hn_update_link_status(struct hn_softc *sc)
2633 {
2634 
2635 	if (sc->hn_mgmt_taskq != NULL)
2636 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2637 }
2638 
2639 static void
2640 hn_change_network(struct hn_softc *sc)
2641 {
2642 
2643 	if (sc->hn_mgmt_taskq != NULL)
2644 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2645 }
2646 
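/*
 * Load the mbuf chain into the txdesc's DMA map for SGLIST-style
 * transmission.  If the chain has too many segments (EFBIG), collapse
 * it and retry once.
 */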
2647 static __inline int
2648 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2649     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2650 {
2651 	struct mbuf *m = *m_head;
2652 	int error;
2653 
2654 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2655 
2656 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2657 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2658 	if (error == EFBIG) {
2659 		struct mbuf *m_new;
2660 
2661 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2662 		if (m_new == NULL)
2663 			return ENOBUFS;
2664 		else
2665 			*m_head = m = m_new;
2666 		txr->hn_tx_collapsed++;
2667 
2668 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2669 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2670 	}
2671 	if (!error) {
2672 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2673 		    BUS_DMASYNC_PREWRITE);
2674 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2675 	}
2676 	return error;
2677 }
2678 
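/*
 * Drop a reference on the txdesc.  When the last reference goes away,
 * free any txdescs aggregated onto it, release its chimney buffer or
 * DMA map, free its mbuf and put it back onto the free list/buf_ring.
 * Returns 1 if the txdesc was actually freed, 0 otherwise.
 */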
2679 static __inline int
2680 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2681 {
2682 
2683 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2684 	    ("put an onlist txd %#x", txd->flags));
2685 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2686 	    ("put an onagg txd %#x", txd->flags));
2687 
2688 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2689 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2690 		return 0;
2691 
2692 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2693 		struct hn_txdesc *tmp_txd;
2694 
2695 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2696 			int freed;
2697 
2698 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2699 			    ("recursive aggregation on aggregated txdesc"));
2700 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2701 			    ("not aggregated txdesc"));
2702 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2703 			    ("aggregated txdesc uses dmamap"));
2704 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2705 			    ("aggregated txdesc consumes "
2706 			     "chimney sending buffer"));
2707 			KASSERT(tmp_txd->chim_size == 0,
2708 			    ("aggregated txdesc has non-zero "
2709 			     "chimney sending size"));
2710 
2711 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2712 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2713 			freed = hn_txdesc_put(txr, tmp_txd);
2714 			KASSERT(freed, ("failed to free aggregated txdesc"));
2715 		}
2716 	}
2717 
2718 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2719 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2720 		    ("chim txd uses dmamap"));
2721 		hn_chim_free(txr->hn_sc, txd->chim_index);
2722 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2723 		txd->chim_size = 0;
2724 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2725 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2726 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2727 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2728 		    txd->data_dmap);
2729 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2730 	}
2731 
2732 	if (txd->m != NULL) {
2733 		m_freem(txd->m);
2734 		txd->m = NULL;
2735 	}
2736 
2737 	txd->flags |= HN_TXD_FLAG_ONLIST;
2738 #ifndef HN_USE_TXDESC_BUFRING
2739 	mtx_lock_spin(&txr->hn_txlist_spin);
2740 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2741 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2742 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2743 	txr->hn_txdesc_avail++;
2744 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2745 	mtx_unlock_spin(&txr->hn_txlist_spin);
2746 #else	/* HN_USE_TXDESC_BUFRING */
2747 #ifdef HN_DEBUG
2748 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2749 #endif
2750 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2751 #endif	/* !HN_USE_TXDESC_BUFRING */
2752 
2753 	return 1;
2754 }
2755 
2756 static __inline struct hn_txdesc *
2757 hn_txdesc_get(struct hn_tx_ring *txr)
2758 {
2759 	struct hn_txdesc *txd;
2760 
2761 #ifndef HN_USE_TXDESC_BUFRING
2762 	mtx_lock_spin(&txr->hn_txlist_spin);
2763 	txd = SLIST_FIRST(&txr->hn_txlist);
2764 	if (txd != NULL) {
2765 		KASSERT(txr->hn_txdesc_avail > 0,
2766 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2767 		txr->hn_txdesc_avail--;
2768 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2769 	}
2770 	mtx_unlock_spin(&txr->hn_txlist_spin);
2771 #else
2772 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2773 #endif
2774 
2775 	if (txd != NULL) {
2776 #ifdef HN_USE_TXDESC_BUFRING
2777 #ifdef HN_DEBUG
2778 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2779 #endif
2780 #endif	/* HN_USE_TXDESC_BUFRING */
2781 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2782 		    STAILQ_EMPTY(&txd->agg_list) &&
2783 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2784 		    txd->chim_size == 0 &&
2785 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2786 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2787 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2788 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2789 		txd->refs = 1;
2790 	}
2791 	return txd;
2792 }
2793 
2794 static __inline void
2795 hn_txdesc_hold(struct hn_txdesc *txd)
2796 {
2797 
2798 	/* 0->1 transition will never work */
2799 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2800 	atomic_add_int(&txd->refs, 1);
2801 }
2802 
2803 static __inline void
2804 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2805 {
2806 
2807 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2808 	    ("recursive aggregation on aggregating txdesc"));
2809 
2810 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2811 	    ("already aggregated"));
2812 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2813 	    ("recursive aggregation on to-be-aggregated txdesc"));
2814 
2815 	txd->flags |= HN_TXD_FLAG_ONAGG;
2816 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2817 }
2818 
2819 static bool
2820 hn_tx_ring_pending(struct hn_tx_ring *txr)
2821 {
2822 	bool pending = false;
2823 
2824 #ifndef HN_USE_TXDESC_BUFRING
2825 	mtx_lock_spin(&txr->hn_txlist_spin);
2826 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2827 		pending = true;
2828 	mtx_unlock_spin(&txr->hn_txlist_spin);
2829 #else
2830 	if (!buf_ring_full(txr->hn_txdesc_br))
2831 		pending = true;
2832 #endif
2833 	return (pending);
2834 }
2835 
2836 static __inline void
2837 hn_txeof(struct hn_tx_ring *txr)
2838 {
2839 	txr->hn_has_txeof = 0;
2840 	txr->hn_txeof(txr);
2841 }
2842 
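/*
 * NVS send-done callback: release the txdesc and, once enough
 * completions have accumulated while the TX ring is marked oactive,
 * kick the transmit path via hn_txeof().
 */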
2843 static void
2844 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2845     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2846 {
2847 	struct hn_txdesc *txd = sndc->hn_cbarg;
2848 	struct hn_tx_ring *txr;
2849 
2850 	txr = txd->txr;
2851 	KASSERT(txr->hn_chan == chan,
2852 	    ("channel mismatch, on chan%u, should be chan%u",
2853 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2854 
2855 	txr->hn_has_txeof = 1;
2856 	hn_txdesc_put(txr, txd);
2857 
2858 	++txr->hn_txdone_cnt;
2859 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2860 		txr->hn_txdone_cnt = 0;
2861 		if (txr->hn_oactive)
2862 			hn_txeof(txr);
2863 	}
2864 }
2865 
2866 static void
2867 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2868 {
2869 #if defined(INET) || defined(INET6)
2870 	tcp_lro_flush_all(&rxr->hn_lro);
2871 #endif
2872 
2873 	/*
2874 	 * 'txr' could be NULL, if multiple channels are used with
2875 	 * the ifnet.if_start method.
2876 	 * ifnet.if_start method are enabled.
2877 	 */
2878 	if (txr == NULL || !txr->hn_has_txeof)
2879 		return;
2880 
2881 	txr->hn_txdone_cnt = 0;
2882 	hn_txeof(txr);
2883 }
2884 
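/*
 * Offsets in a RNDIS packet message are counted from the rm_dataoffset
 * field, not from the beginning of the message; convert an offset that
 * is relative to the message start accordingly.
 */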
2885 static __inline uint32_t
2886 hn_rndis_pktmsg_offset(uint32_t ofs)
2887 {
2888 
2889 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2890 	    ("invalid RNDIS packet msg offset %u", ofs));
2891 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2892 }
2893 
2894 static __inline void *
2895 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2896     size_t pi_dlen, uint32_t pi_type)
2897 {
2898 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2899 	struct rndis_pktinfo *pi;
2900 
2901 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2902 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2903 
2904 	/*
2905 	 * Per-packet-info does not move; it only grows.
2906 	 *
2907 	 * NOTE:
2908 	 * rm_pktinfooffset in this phase counts from the beginning
2909 	 * of rndis_packet_msg.
2910 	 */
2911 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2912 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2913 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2914 	    pkt->rm_pktinfolen);
2915 	pkt->rm_pktinfolen += pi_size;
2916 
2917 	pi->rm_size = pi_size;
2918 	pi->rm_type = pi_type;
2919 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2920 
2921 	return (pi->rm_data);
2922 }
2923 
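/*
 * Send the currently aggregating txdesc.  On failure its mbuf is freed
 * here and oerrors is bumped by the number of packets it carried.  The
 * ring's aggregation state is reset in either case.
 */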
2924 static __inline int
2925 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2926 {
2927 	struct hn_txdesc *txd;
2928 	struct mbuf *m;
2929 	int error, pkts;
2930 
2931 	txd = txr->hn_agg_txd;
2932 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2933 
2934 	/*
2935 	 * Since hn_txpkt() will reset this temporary stat, save
2936 	 * it now, so that oerrors can be updated properly, if
2937 	 * hn_txpkt() ever fails.
2938 	 */
2939 	pkts = txr->hn_stat_pkts;
2940 
2941 	/*
2942 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2943 	 * failure, save it for later freeing, if hn_txpkt() ever
2944 	 * fails.
2945 	 */
2946 	m = txd->m;
2947 	error = hn_txpkt(ifp, txr, txd);
2948 	if (__predict_false(error)) {
2949 		/* txd is freed, but m is not. */
2950 		m_freem(m);
2951 
2952 		txr->hn_flush_failed++;
2953 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2954 	}
2955 
2956 	/* Reset all aggregation states. */
2957 	txr->hn_agg_txd = NULL;
2958 	txr->hn_agg_szleft = 0;
2959 	txr->hn_agg_pktleft = 0;
2960 	txr->hn_agg_prevpkt = NULL;
2961 
2962 	return (error);
2963 }
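/*
 * Try to reserve space for this packet in the chimney sending buffer.
 * If an aggregation is in progress and the packet fits, append it to
 * the aggregating txdesc; otherwise flush the aggregation, allocate a
 * fresh chimney buffer and possibly start a new aggregation.  Returns
 * a pointer into the chimney buffer, or NULL if chimney sending cannot
 * be used for this packet.
 */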
2964 
2965 static void *
2966 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2967     int pktsize)
2968 {
2969 	void *chim;
2970 
2971 	if (txr->hn_agg_txd != NULL) {
2972 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2973 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2974 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2975 			int olen;
2976 
2977 			/*
2978 			 * Update the previous RNDIS packet's total length;
2979 			 * it may be increased due to the mandatory alignment
2980 			 * padding for this RNDIS packet.  Update the
2981 			 * aggregating txdesc's chimney sending buffer size
2982 			 * accordingly.
2983 			 *
2984 			 * XXX
2985 			 * Zero-out the padding, as required by the RNDIS spec.
2986 			 */
2987 			olen = pkt->rm_len;
2988 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2989 			agg_txd->chim_size += pkt->rm_len - olen;
2990 
2991 			/* Link this txdesc to the parent. */
2992 			hn_txdesc_agg(agg_txd, txd);
2993 
2994 			chim = (uint8_t *)pkt + pkt->rm_len;
2995 			/* Save the current packet for later fixup. */
2996 			txr->hn_agg_prevpkt = chim;
2997 
2998 			txr->hn_agg_pktleft--;
2999 			txr->hn_agg_szleft -= pktsize;
3000 			if (txr->hn_agg_szleft <=
3001 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3002 				/*
3003 				 * Probably can't aggregate more packets,
3004 				 * flush this aggregating txdesc proactively.
3005 				 */
3006 				txr->hn_agg_pktleft = 0;
3007 			}
3008 			/* Done! */
3009 			return (chim);
3010 		}
3011 		hn_flush_txagg(ifp, txr);
3012 	}
3013 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3014 
3015 	txr->hn_tx_chimney_tried++;
3016 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3017 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3018 		return (NULL);
3019 	txr->hn_tx_chimney++;
3020 
3021 	chim = txr->hn_sc->hn_chim +
3022 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3023 
3024 	if (txr->hn_agg_pktmax > 1 &&
3025 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3026 		txr->hn_agg_txd = txd;
3027 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3028 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3029 		txr->hn_agg_prevpkt = chim;
3030 	}
3031 	return (chim);
3032 }
3033 
3034 /*
3035  * NOTE:
3036  * If this function fails, then both txd and m_head0 will be freed.
3037  */
3038 static int
3039 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3040     struct mbuf **m_head0)
3041 {
3042 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3043 	int error, nsegs, i;
3044 	struct mbuf *m_head = *m_head0;
3045 	struct rndis_packet_msg *pkt;
3046 	uint32_t *pi_data;
3047 	void *chim = NULL;
3048 	int pkt_hlen, pkt_size;
3049 
3050 	pkt = txd->rndis_pkt;
3051 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3052 	if (pkt_size < txr->hn_chim_size) {
3053 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3054 		if (chim != NULL)
3055 			pkt = chim;
3056 	} else {
3057 		if (txr->hn_agg_txd != NULL)
3058 			hn_flush_txagg(ifp, txr);
3059 	}
3060 
3061 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3062 	pkt->rm_len = m_head->m_pkthdr.len;
3063 	pkt->rm_dataoffset = 0;
3064 	pkt->rm_datalen = m_head->m_pkthdr.len;
3065 	pkt->rm_oobdataoffset = 0;
3066 	pkt->rm_oobdatalen = 0;
3067 	pkt->rm_oobdataelements = 0;
3068 	pkt->rm_pktinfooffset = sizeof(*pkt);
3069 	pkt->rm_pktinfolen = 0;
3070 	pkt->rm_vchandle = 0;
3071 	pkt->rm_reserved = 0;
3072 
3073 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3074 		/*
3075 		 * Set the hash value for this packet, so that the host could
3076 		 * dispatch the TX done event for this packet back to this TX
3077 		 * ring's channel.
3078 		 */
3079 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3080 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3081 		*pi_data = txr->hn_tx_idx;
3082 	}
3083 
3084 	if (m_head->m_flags & M_VLANTAG) {
3085 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3086 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3087 		*pi_data = NDIS_VLAN_INFO_MAKE(
3088 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3089 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3090 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3091 	}
3092 
3093 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3094 #if defined(INET6) || defined(INET)
3095 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3096 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3097 #ifdef INET
3098 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3099 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3100 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3101 			    m_head->m_pkthdr.tso_segsz);
3102 		}
3103 #endif
3104 #if defined(INET6) && defined(INET)
3105 		else
3106 #endif
3107 #ifdef INET6
3108 		{
3109 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3110 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3111 			    m_head->m_pkthdr.tso_segsz);
3112 		}
3113 #endif
3114 #endif	/* INET6 || INET */
3115 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3116 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3117 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3118 		if (m_head->m_pkthdr.csum_flags &
3119 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3120 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3121 		} else {
3122 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3123 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3124 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3125 		}
3126 
3127 		if (m_head->m_pkthdr.csum_flags &
3128 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3129 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3130 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3131 		} else if (m_head->m_pkthdr.csum_flags &
3132 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3133 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3134 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3135 		}
3136 	}
3137 
3138 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3139 	/* Fixup RNDIS packet message total length */
3140 	pkt->rm_len += pkt_hlen;
3141 	/* Convert RNDIS packet message offsets */
3142 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3143 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3144 
3145 	/*
3146 	 * Fast path: Chimney sending.
3147 	 */
3148 	if (chim != NULL) {
3149 		struct hn_txdesc *tgt_txd = txd;
3150 
3151 		if (txr->hn_agg_txd != NULL) {
3152 			tgt_txd = txr->hn_agg_txd;
3153 #ifdef INVARIANTS
3154 			*m_head0 = NULL;
3155 #endif
3156 		}
3157 
3158 		KASSERT(pkt == chim,
3159 		    ("RNDIS pkt not in chimney sending buffer"));
3160 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3161 		    ("chimney sending buffer is not used"));
3162 		tgt_txd->chim_size += pkt->rm_len;
3163 
3164 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3165 		    ((uint8_t *)chim) + pkt_hlen);
3166 
3167 		txr->hn_gpa_cnt = 0;
3168 		txr->hn_sendpkt = hn_txpkt_chim;
3169 		goto done;
3170 	}
3171 
3172 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3173 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3174 	    ("chimney buffer is used"));
3175 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3176 
3177 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3178 	if (__predict_false(error)) {
3179 		int freed;
3180 
3181 		/*
3182 		 * This mbuf is not linked w/ the txd yet, so free it now.
3183 		 */
3184 		m_freem(m_head);
3185 		*m_head0 = NULL;
3186 
3187 		freed = hn_txdesc_put(txr, txd);
3188 		KASSERT(freed != 0,
3189 		    ("fail to free txd upon txdma error"));
3190 		    ("failed to free txd upon txdma error"));
3191 		txr->hn_txdma_failed++;
3192 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3193 		return error;
3194 	}
3195 	*m_head0 = m_head;
3196 
3197 	/* +1 RNDIS packet message */
3198 	txr->hn_gpa_cnt = nsegs + 1;
3199 
3200 	/* send packet with page buffer */
3201 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3202 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3203 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3204 
3205 	/*
3206 	 * Fill the page buffers with mbuf info after the page
3207 	 * buffer for RNDIS packet message.
3208 	 */
3209 	for (i = 0; i < nsegs; ++i) {
3210 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3211 
3212 		gpa->gpa_page = atop(segs[i].ds_addr);
3213 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3214 		gpa->gpa_len = segs[i].ds_len;
3215 	}
3216 
3217 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3218 	txd->chim_size = 0;
3219 	txr->hn_sendpkt = hn_txpkt_sglist;
3220 done:
3221 	txd->m = m_head;
3222 
3223 	/* Set the completion routine */
3224 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3225 
3226 	/* Update temporary stats for later use. */
3227 	txr->hn_stat_pkts++;
3228 	txr->hn_stat_size += m_head->m_pkthdr.len;
3229 	if (m_head->m_flags & M_MCAST)
3230 		txr->hn_stat_mcasts++;
3231 
3232 	return 0;
3233 }
3234 
3235 /*
3236  * NOTE:
3237  * If this function fails, then txd will be freed, but the mbuf
3238  * associated w/ the txd will _not_ be freed.
3239  */
3240 static int
3241 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3242 {
3243 	int error, send_failed = 0, has_bpf;
3244 
3245 again:
3246 	has_bpf = bpf_peers_present(ifp->if_bpf);
3247 	if (has_bpf) {
3248 		/*
3249 		 * Make sure that this txd and any aggregated txds are not
3250 		 * freed before ETHER_BPF_MTAP.
3251 		 */
3252 		hn_txdesc_hold(txd);
3253 	}
3254 	error = txr->hn_sendpkt(txr, txd);
3255 	if (!error) {
3256 		if (has_bpf) {
3257 			const struct hn_txdesc *tmp_txd;
3258 
3259 			ETHER_BPF_MTAP(ifp, txd->m);
3260 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3261 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3262 		}
3263 
3264 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3265 #ifdef HN_IFSTART_SUPPORT
3266 		if (!hn_use_if_start)
3267 #endif
3268 		{
3269 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3270 			    txr->hn_stat_size);
3271 			if (txr->hn_stat_mcasts != 0) {
3272 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3273 				    txr->hn_stat_mcasts);
3274 			}
3275 		}
3276 		txr->hn_pkts += txr->hn_stat_pkts;
3277 		txr->hn_sends++;
3278 	}
3279 	if (has_bpf)
3280 		hn_txdesc_put(txr, txd);
3281 
3282 	if (__predict_false(error)) {
3283 		int freed;
3284 
3285 		/*
3286 		 * This should "really rarely" happen.
3287 		 *
3288 		 * XXX Too many RX to be acked or too many sideband
3289 		 * commands to run?  Ask netvsc_channel_rollup()
3290 		 * to kick start later.
3291 		 */
3292 		txr->hn_has_txeof = 1;
3293 		if (!send_failed) {
3294 			txr->hn_send_failed++;
3295 			send_failed = 1;
3296 			/*
3297 			 * Try sending again after setting hn_has_txeof,
3298 			 * in case we missed the last
3299 			 * netvsc_channel_rollup().
3300 			 */
3301 			goto again;
3302 		}
3303 		if_printf(ifp, "send failed\n");
3304 
3305 		/*
3306 		 * Caller will perform further processing on the
3307 		 * associated mbuf, so don't free it in hn_txdesc_put();
3308 		 * only unload it from the DMA map in hn_txdesc_put(),
3309 		 * if it was loaded.
3310 		 */
3311 		txd->m = NULL;
3312 		freed = hn_txdesc_put(txr, txd);
3313 		KASSERT(freed != 0,
3314 		    ("fail to free txd upon send error"));
3315 		    ("failed to free txd upon send error"));
3316 		txr->hn_send_failed++;
3317 	}
3318 
3319 	/* Reset temporary stats, after this sending is done. */
3320 	txr->hn_stat_size = 0;
3321 	txr->hn_stat_pkts = 0;
3322 	txr->hn_stat_mcasts = 0;
3323 
3324 	return (error);
3325 }
3326 
3327 /*
3328  * Append the specified data to the indicated mbuf chain.
3329  * Extend the mbuf chain if the new data does not fit in
3330  * existing space.
3331  *
3332  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3333  * There should be an equivalent in the kernel mbuf code,
3334  * but there does not appear to be one yet.
3335  *
3336  * Differs from m_append() in that additional mbufs are
3337  * allocated with cluster size MJUMPAGESIZE, and filled
3338  * accordingly.
3339  *
3340  * Return 1 if able to complete the job; otherwise 0.
3341  */
3342 static int
3343 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3344 {
3345 	struct mbuf *m, *n;
3346 	int remainder, space;
3347 
3348 	for (m = m0; m->m_next != NULL; m = m->m_next)
3349 		;
3350 	remainder = len;
3351 	space = M_TRAILINGSPACE(m);
3352 	if (space > 0) {
3353 		/*
3354 		 * Copy into available space.
3355 		 */
3356 		if (space > remainder)
3357 			space = remainder;
3358 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3359 		m->m_len += space;
3360 		cp += space;
3361 		remainder -= space;
3362 	}
3363 	while (remainder > 0) {
3364 		/*
3365 		 * Allocate a new mbuf; could check space
3366 		 * and allocate a cluster instead.
3367 		 */
3368 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3369 		if (n == NULL)
3370 			break;
3371 		n->m_len = min(MJUMPAGESIZE, remainder);
3372 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3373 		cp += n->m_len;
3374 		remainder -= n->m_len;
3375 		m->m_next = n;
3376 		m = n;
3377 	}
3378 	if (m0->m_flags & M_PKTHDR)
3379 		m0->m_pkthdr.len += len - remainder;
3380 
3381 	return (remainder == 0);
3382 }
3383 
3384 #if defined(INET) || defined(INET6)
3385 static __inline int
3386 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3387 {
3388 #if __FreeBSD_version >= 1100095
3389 	if (hn_lro_mbufq_depth) {
3390 		tcp_lro_queue_mbuf(lc, m);
3391 		return 0;
3392 	}
3393 #endif
3394 	return tcp_lro_rx(lc, m, 0);
3395 }
3396 #endif
3397 
3398 static int
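/*
 * Receive path for a single RNDIS data packet: copy it into an mbuf,
 * apply host-supplied checksum/VLAN/RSS hash information, then hand
 * the mbuf to hn(4) itself or, when a non-transparent VF is active,
 * to the VF's ifnet.
 */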
3399 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3400     const struct hn_rxinfo *info)
3401 {
3402 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3403 	struct mbuf *m_new;
3404 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3405 	int hash_type = M_HASHTYPE_NONE;
3406 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3407 
3408 	ifp = hn_ifp;
3409 	if (rxr->hn_rxvf_ifp != NULL) {
3410 		/*
3411 		 * Non-transparent mode VF; pretend this packet is from
3412 		 * the VF.
3413 		 */
3414 		ifp = rxr->hn_rxvf_ifp;
3415 		is_vf = 1;
3416 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3417 		/* Transparent mode VF. */
3418 		is_vf = 1;
3419 	}
3420 
3421 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3422 		/*
3423 		 * NOTE:
3424 		 * See the NOTE of hn_rndis_init_fixat().  This
3425 		 * function can be reached immediately after the
3426 		 * RNDIS is initialized but before the ifnet is
3427 		 * set up on the hn_attach() path; drop the unexpected
3428 		 * packets.
3429 		 */
3430 		return (0);
3431 	}
3432 
3433 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3434 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3435 		return (0);
3436 	}
3437 
3438 	if (dlen <= MHLEN) {
3439 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3440 		if (m_new == NULL) {
3441 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3442 			return (0);
3443 		}
3444 		memcpy(mtod(m_new, void *), data, dlen);
3445 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3446 		rxr->hn_small_pkts++;
3447 	} else {
3448 		/*
3449 		 * Get an mbuf with a cluster.  For packets 2K or less,
3450 		 * get a standard 2K cluster.  For anything larger, get a
3451 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3452 		 * if looped around to the Hyper-V TX channel, so avoid them.
3453 		 */
3454 		size = MCLBYTES;
3455 		if (dlen > MCLBYTES) {
3456 			/* 4096 */
3457 			size = MJUMPAGESIZE;
3458 		}
3459 
3460 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3461 		if (m_new == NULL) {
3462 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3463 			return (0);
3464 		}
3465 
3466 		hv_m_append(m_new, dlen, data);
3467 	}
3468 	m_new->m_pkthdr.rcvif = ifp;
3469 
3470 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3471 		do_csum = 0;
3472 
3473 	/* receive side checksum offload */
3474 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3475 		/* IP csum offload */
3476 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3477 			m_new->m_pkthdr.csum_flags |=
3478 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3479 			rxr->hn_csum_ip++;
3480 		}
3481 
3482 		/* TCP/UDP csum offload */
3483 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3484 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3485 			m_new->m_pkthdr.csum_flags |=
3486 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3487 			m_new->m_pkthdr.csum_data = 0xffff;
3488 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3489 				rxr->hn_csum_tcp++;
3490 			else
3491 				rxr->hn_csum_udp++;
3492 		}
3493 
3494 		/*
3495 		 * XXX
3496 		 * As of this writing (Oct 28th, 2016), the host side will turn
3497 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3498 		 * the do_lro setting here is actually _not_ accurate.  We
3499 		 * depend on the RSS hash type check to reset do_lro.
3500 		 */
3501 		if ((info->csum_info &
3502 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3503 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3504 			do_lro = 1;
3505 	} else {
3506 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3507 		if (l3proto == ETHERTYPE_IP) {
3508 			if (l4proto == IPPROTO_TCP) {
3509 				if (do_csum &&
3510 				    (rxr->hn_trust_hcsum &
3511 				     HN_TRUST_HCSUM_TCP)) {
3512 					rxr->hn_csum_trusted++;
3513 					m_new->m_pkthdr.csum_flags |=
3514 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3515 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3516 					m_new->m_pkthdr.csum_data = 0xffff;
3517 				}
3518 				do_lro = 1;
3519 			} else if (l4proto == IPPROTO_UDP) {
3520 				if (do_csum &&
3521 				    (rxr->hn_trust_hcsum &
3522 				     HN_TRUST_HCSUM_UDP)) {
3523 					rxr->hn_csum_trusted++;
3524 					m_new->m_pkthdr.csum_flags |=
3525 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3526 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3527 					m_new->m_pkthdr.csum_data = 0xffff;
3528 				}
3529 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3530 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3531 				rxr->hn_csum_trusted++;
3532 				m_new->m_pkthdr.csum_flags |=
3533 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3534 			}
3535 		}
3536 	}
3537 
3538 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3539 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3540 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3541 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3542 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3543 		m_new->m_flags |= M_VLANTAG;
3544 	}
3545 
3546 	/*
3547 	 * If VF is activated (transparent/non-transparent mode does not
3548 	 * matter here).
3549 	 *
3550 	 * - Disable LRO
3551 	 *
3552 	 *   hn(4) will only receive broadcast packets, multicast packets,
3553 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3554 	 *   packet types.
3555 	 *
3556 	 *   For non-transparent mode, we definitely _cannot_ enable LRO
3557 	 *   at all, since the LRO flush will use hn(4) as the receiving
3558 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3559 	 */
3560 	if (is_vf)
3561 		do_lro = 0;
3562 
3563 	/*
3564 	 * If VF is activated (transparent/non-transparent mode does not
3565 	 * matter here), do _not_ mess with unsupported hash types or
3566 	 * functions.
3567 	 */
3568 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3569 		rxr->hn_rss_pkts++;
3570 		m_new->m_pkthdr.flowid = info->hash_value;
3571 		if (!is_vf)
3572 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3573 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3574 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3575 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3576 			    rxr->hn_mbuf_hash);
3577 
3578 			/*
3579 			 * NOTE:
3580 			 * do_lro is reset if the hash types are not TCP
3581 			 * related.  See the comment in the above csum_flags
3582 			 * setup section.
3583 			 */
3584 			switch (type) {
3585 			case NDIS_HASH_IPV4:
3586 				hash_type = M_HASHTYPE_RSS_IPV4;
3587 				do_lro = 0;
3588 				break;
3589 
3590 			case NDIS_HASH_TCP_IPV4:
3591 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3592 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3593 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3594 
3595 					if (is_vf)
3596 						def_htype = M_HASHTYPE_NONE;
3597 
3598 					/*
3599 					 * UDP 4-tuple hash is delivered as
3600 					 * TCP 4-tuple hash.
3601 					 */
3602 					if (l3proto == ETHERTYPE_MAX) {
3603 						hn_rxpkt_proto(m_new,
3604 						    &l3proto, &l4proto);
3605 					}
3606 					if (l3proto == ETHERTYPE_IP) {
3607 						if (l4proto == IPPROTO_UDP &&
3608 						    (rxr->hn_mbuf_hash &
3609 						     NDIS_HASH_UDP_IPV4_X)) {
3610 							hash_type =
3611 							M_HASHTYPE_RSS_UDP_IPV4;
3612 							do_lro = 0;
3613 						} else if (l4proto !=
3614 						    IPPROTO_TCP) {
3615 							hash_type = def_htype;
3616 							do_lro = 0;
3617 						}
3618 					} else {
3619 						hash_type = def_htype;
3620 						do_lro = 0;
3621 					}
3622 				}
3623 				break;
3624 
3625 			case NDIS_HASH_IPV6:
3626 				hash_type = M_HASHTYPE_RSS_IPV6;
3627 				do_lro = 0;
3628 				break;
3629 
3630 			case NDIS_HASH_IPV6_EX:
3631 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3632 				do_lro = 0;
3633 				break;
3634 
3635 			case NDIS_HASH_TCP_IPV6:
3636 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3637 				break;
3638 
3639 			case NDIS_HASH_TCP_IPV6_EX:
3640 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3641 				break;
3642 			}
3643 		}
3644 	} else if (!is_vf) {
3645 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3646 		hash_type = M_HASHTYPE_OPAQUE;
3647 	}
3648 	M_HASHTYPE_SET(m_new, hash_type);
3649 
3650 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3651 	if (hn_ifp != ifp) {
3652 		const struct ether_header *eh;
3653 
3654 		/*
3655 		 * Non-transparent mode VF is activated.
3656 		 */
3657 
3658 		/*
3659 		 * Allow tapping on hn(4).
3660 		 */
3661 		ETHER_BPF_MTAP(hn_ifp, m_new);
3662 
3663 		/*
3664 		 * Update hn(4)'s stats.
3665 		 */
3666 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3667 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3668 		/* Checked at the beginning of this function. */
3669 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3670 		eh = mtod(m_new, struct ether_header *);
3671 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3672 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3673 	}
3674 	rxr->hn_pkts++;
3675 
3676 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3677 #if defined(INET) || defined(INET6)
3678 		struct lro_ctrl *lro = &rxr->hn_lro;
3679 
3680 		if (lro->lro_cnt) {
3681 			rxr->hn_lro_tried++;
3682 			if (hn_lro_rx(lro, m_new) == 0) {
3683 				/* DONE! */
3684 				return 0;
3685 			}
3686 		}
3687 #endif
3688 	}
3689 	ifp->if_input(ifp, m_new);
3690 
3691 	return (0);
3692 }
3693 
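/*
 * Summary of the ioctl cases handled below: SIOCSIFMTU re-attaches the
 * synthetic parts (NVS/RNDIS) with the new MTU, SIOCSIFFLAGS and
 * SIOCADDMULTI/SIOCDELMULTI reprogram the RX filter, SIOCSIFCAP toggles
 * offload capabilities, and the media/RSS requests are answered from the
 * synthetic state or forwarded to a ready transparent-mode VF.
 *
 * Illustrative userland triggers (interface name assumed to be hn0):
 *   # ifconfig hn0 mtu 9000       -> SIOCSIFMTU path
 *   # ifconfig hn0 -txcsum        -> SIOCSIFCAP path
 */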
3694 static int
3695 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3696 {
3697 	struct hn_softc *sc = ifp->if_softc;
3698 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3699 	struct ifnet *vf_ifp;
3700 	int mask, error = 0;
3701 	struct ifrsskey *ifrk;
3702 	struct ifrsshash *ifrh;
3703 	uint32_t mtu;
3704 
3705 	switch (cmd) {
3706 	case SIOCSIFMTU:
3707 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3708 			error = EINVAL;
3709 			break;
3710 		}
3711 
3712 		HN_LOCK(sc);
3713 
3714 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3715 			HN_UNLOCK(sc);
3716 			break;
3717 		}
3718 
3719 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3720 			/* Can't change MTU */
3721 			HN_UNLOCK(sc);
3722 			error = EOPNOTSUPP;
3723 			break;
3724 		}
3725 
3726 		if (ifp->if_mtu == ifr->ifr_mtu) {
3727 			HN_UNLOCK(sc);
3728 			break;
3729 		}
3730 
3731 		if (hn_xpnt_vf_isready(sc)) {
3732 			vf_ifp = sc->hn_vf_ifp;
3733 			ifr_vf = *ifr;
3734 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3735 			    sizeof(ifr_vf.ifr_name));
3736 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3737 			    (caddr_t)&ifr_vf);
3738 			if (error) {
3739 				HN_UNLOCK(sc);
3740 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3741 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3742 				break;
3743 			}
3744 		}
3745 
3746 		/*
3747 		 * Suspend this interface before the synthetic parts
3748 		 * are ripped.
3749 		 */
3750 		hn_suspend(sc);
3751 
3752 		/*
3753 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3754 		 */
3755 		hn_synth_detach(sc);
3756 
3757 		/*
3758 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3759 		 * with the new MTU setting.
3760 		 */
3761 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3762 		if (error) {
3763 			HN_UNLOCK(sc);
3764 			break;
3765 		}
3766 
3767 		error = hn_rndis_get_mtu(sc, &mtu);
3768 		if (error)
3769 			mtu = ifr->ifr_mtu;
3770 		else if (bootverbose)
3771 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3772 
3773 		/*
3774 		 * Commit the requested MTU, after the synthetic parts
3775 		 * have been successfully attached.
3776 		 */
3777 		if (mtu >= ifr->ifr_mtu) {
3778 			mtu = ifr->ifr_mtu;
3779 		} else {
3780 			if_printf(ifp, "fixup mtu %d -> %u\n",
3781 			    ifr->ifr_mtu, mtu);
3782 		}
3783 		ifp->if_mtu = mtu;
3784 
3785 		/*
3786 		 * Synthetic parts' reattach may change the chimney
3787 		 * sending size; update it.
3788 		 */
3789 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3790 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3791 
3792 		/*
3793 		 * Make sure that various parameters based on MTU are
3794 		 * still valid, after the MTU change.
3795 		 */
3796 		hn_mtu_change_fixup(sc);
3797 
3798 		/*
3799 		 * All done!  Resume the interface now.
3800 		 */
3801 		hn_resume(sc);
3802 
3803 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3804 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3805 			/*
3806 			 * Since we have reattached the NVS part,
3807 			 * switch the datapath back to the VF, in case
3808 			 * it was lost while the NVS was detached.
3809 			 */
3810 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3811 		}
3812 
3813 		HN_UNLOCK(sc);
3814 		break;
3815 
3816 	case SIOCSIFFLAGS:
3817 		HN_LOCK(sc);
3818 
3819 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3820 			HN_UNLOCK(sc);
3821 			break;
3822 		}
3823 
3824 		if (hn_xpnt_vf_isready(sc))
3825 			hn_xpnt_vf_saveifflags(sc);
3826 
3827 		if (ifp->if_flags & IFF_UP) {
3828 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3829 				/*
3830 				 * Caller might hold a mutex, e.g.
3831 				 * bpf; use busy-wait for the RNDIS
3832 				 * reply.
3833 				 */
3834 				HN_NO_SLEEPING(sc);
3835 				hn_rxfilter_config(sc);
3836 				HN_SLEEPING_OK(sc);
3837 
3838 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3839 					error = hn_xpnt_vf_iocsetflags(sc);
3840 			} else {
3841 				hn_init_locked(sc);
3842 			}
3843 		} else {
3844 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3845 				hn_stop(sc, false);
3846 		}
3847 		sc->hn_if_flags = ifp->if_flags;
3848 
3849 		HN_UNLOCK(sc);
3850 		break;
3851 
3852 	case SIOCSIFCAP:
3853 		HN_LOCK(sc);
3854 
3855 		if (hn_xpnt_vf_isready(sc)) {
3856 			ifr_vf = *ifr;
3857 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3858 			    sizeof(ifr_vf.ifr_name));
3859 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3860 			HN_UNLOCK(sc);
3861 			break;
3862 		}
3863 
3864 		/*
3865 		 * Fix up requested capabilities w/ supported capabilities,
3866 		 * since the supported capabilities could have been changed.
3867 		 */
3868 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3869 		    ifp->if_capenable;
3870 
3871 		if (mask & IFCAP_TXCSUM) {
3872 			ifp->if_capenable ^= IFCAP_TXCSUM;
3873 			if (ifp->if_capenable & IFCAP_TXCSUM)
3874 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3875 			else
3876 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3877 		}
3878 		if (mask & IFCAP_TXCSUM_IPV6) {
3879 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3880 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3881 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3882 			else
3883 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3884 		}
3885 
3886 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3887 		if (mask & IFCAP_RXCSUM)
3888 			ifp->if_capenable ^= IFCAP_RXCSUM;
3889 #ifdef foo
3890 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3891 		if (mask & IFCAP_RXCSUM_IPV6)
3892 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3893 #endif
3894 
3895 		if (mask & IFCAP_LRO)
3896 			ifp->if_capenable ^= IFCAP_LRO;
3897 
3898 		if (mask & IFCAP_TSO4) {
3899 			ifp->if_capenable ^= IFCAP_TSO4;
3900 			if (ifp->if_capenable & IFCAP_TSO4)
3901 				ifp->if_hwassist |= CSUM_IP_TSO;
3902 			else
3903 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3904 		}
3905 		if (mask & IFCAP_TSO6) {
3906 			ifp->if_capenable ^= IFCAP_TSO6;
3907 			if (ifp->if_capenable & IFCAP_TSO6)
3908 				ifp->if_hwassist |= CSUM_IP6_TSO;
3909 			else
3910 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3911 		}
3912 
3913 		HN_UNLOCK(sc);
3914 		break;
3915 
3916 	case SIOCADDMULTI:
3917 	case SIOCDELMULTI:
3918 		HN_LOCK(sc);
3919 
3920 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3921 			HN_UNLOCK(sc);
3922 			break;
3923 		}
3924 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3925 			/*
3926 			 * Multicast uses mutex; use busy-wait for
3927 			 * the RNDIS reply.
3928 			 */
3929 			HN_NO_SLEEPING(sc);
3930 			hn_rxfilter_config(sc);
3931 			HN_SLEEPING_OK(sc);
3932 		}
3933 
3934 		/* XXX vlan(4) style mcast addr maintenance */
3935 		if (hn_xpnt_vf_isready(sc)) {
3936 			int old_if_flags;
3937 
3938 			old_if_flags = sc->hn_vf_ifp->if_flags;
3939 			hn_xpnt_vf_saveifflags(sc);
3940 
3941 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3942 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3943 			     IFF_ALLMULTI))
3944 				error = hn_xpnt_vf_iocsetflags(sc);
3945 		}
3946 
3947 		HN_UNLOCK(sc);
3948 		break;
3949 
3950 	case SIOCSIFMEDIA:
3951 	case SIOCGIFMEDIA:
3952 		HN_LOCK(sc);
3953 		if (hn_xpnt_vf_isready(sc)) {
3954 			/*
3955 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3956 			 * create and pass ifr_vf to the VF here; just
3957 			 * replace the ifr_name.
3958 			 */
3959 			vf_ifp = sc->hn_vf_ifp;
3960 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3961 			    sizeof(ifr->ifr_name));
3962 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3963 			/* Restore the ifr_name. */
3964 			strlcpy(ifr->ifr_name, ifp->if_xname,
3965 			    sizeof(ifr->ifr_name));
3966 			HN_UNLOCK(sc);
3967 			break;
3968 		}
3969 		HN_UNLOCK(sc);
3970 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3971 		break;
3972 
3973 	case SIOCGIFRSSHASH:
3974 		ifrh = (struct ifrsshash *)data;
3975 		HN_LOCK(sc);
3976 		if (sc->hn_rx_ring_inuse == 1) {
3977 			HN_UNLOCK(sc);
3978 			ifrh->ifrh_func = RSS_FUNC_NONE;
3979 			ifrh->ifrh_types = 0;
3980 			break;
3981 		}
3982 
3983 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3984 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3985 		else
3986 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3987 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3988 		HN_UNLOCK(sc);
3989 		break;
3990 
3991 	case SIOCGIFRSSKEY:
3992 		ifrk = (struct ifrsskey *)data;
3993 		HN_LOCK(sc);
3994 		if (sc->hn_rx_ring_inuse == 1) {
3995 			HN_UNLOCK(sc);
3996 			ifrk->ifrk_func = RSS_FUNC_NONE;
3997 			ifrk->ifrk_keylen = 0;
3998 			break;
3999 		}
4000 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4001 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4002 		else
4003 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4004 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4005 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4006 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4007 		HN_UNLOCK(sc);
4008 		break;
4009 
4010 	default:
4011 		error = ether_ioctl(ifp, cmd, data);
4012 		break;
4013 	}
4014 	return (error);
4015 }
4016 
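/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable channel polling,
 * switch a transparent-mode VF back to the synthetic datapath and bring
 * the VF down, suspend data transfers, and clear OACTIVE on all TX
 * rings.  Unless detaching, keep the RX filter usable for a
 * non-transparent mode VF.
 */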
4017 static void
4018 hn_stop(struct hn_softc *sc, bool detaching)
4019 {
4020 	struct ifnet *ifp = sc->hn_ifp;
4021 	int i;
4022 
4023 	HN_LOCK_ASSERT(sc);
4024 
4025 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4026 	    ("synthetic parts were not attached"));
4027 
4028 	/* Clear RUNNING bit ASAP. */
4029 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4030 
4031 	/* Disable polling. */
4032 	hn_polling(sc, 0);
4033 
4034 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4035 		KASSERT(sc->hn_vf_ifp != NULL,
4036 		    ("%s: VF is not attached", ifp->if_xname));
4037 
4038 		/* Mark transparent mode VF as disabled. */
4039 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4040 
4041 		/*
4042 		 * NOTE:
4043 		 * Datapath setting must happen _before_ bringing
4044 		 * the VF down.
4045 		 */
4046 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4047 
4048 		/*
4049 		 * Bring the VF down.
4050 		 */
4051 		hn_xpnt_vf_saveifflags(sc);
4052 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4053 		hn_xpnt_vf_iocsetflags(sc);
4054 	}
4055 
4056 	/* Suspend data transfers. */
4057 	hn_suspend_data(sc);
4058 
4059 	/* Clear OACTIVE bit. */
4060 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4061 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4062 		sc->hn_tx_ring[i].hn_oactive = 0;
4063 
4064 	/*
4065 	 * If the non-transparent mode VF is active, make sure
4066 	 * that the RX filter still allows packet reception.
4067 	 */
4068 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4069 		hn_rxfilter_config(sc);
4070 }
4071 
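/*
 * Bring the interface up with HN_LOCK held: program the RX filter,
 * clear the OACTIVE and TX-suspended bits, initialize the
 * transparent-mode VF if it is ready, set IFF_DRV_RUNNING, and
 * re-enable polling if a polling frequency was configured.
 */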
4072 static void
4073 hn_init_locked(struct hn_softc *sc)
4074 {
4075 	struct ifnet *ifp = sc->hn_ifp;
4076 	int i;
4077 
4078 	HN_LOCK_ASSERT(sc);
4079 
4080 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4081 		return;
4082 
4083 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4084 		return;
4085 
4086 	/* Configure RX filter */
4087 	hn_rxfilter_config(sc);
4088 
4089 	/* Clear OACTIVE bit. */
4090 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4091 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4092 		sc->hn_tx_ring[i].hn_oactive = 0;
4093 
4094 	/* Clear TX 'suspended' bit. */
4095 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4096 
4097 	if (hn_xpnt_vf_isready(sc)) {
4098 		/* Initialize transparent VF. */
4099 		hn_xpnt_vf_init(sc);
4100 	}
4101 
4102 	/* Everything is ready; unleash! */
4103 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4104 
4105 	/* Re-enable polling if requested. */
4106 	if (sc->hn_pollhz > 0)
4107 		hn_polling(sc, sc->hn_pollhz);
4108 }
4109 
4110 static void
4111 hn_init(void *xsc)
4112 {
4113 	struct hn_softc *sc = xsc;
4114 
4115 	HN_LOCK(sc);
4116 	hn_init_locked(sc);
4117 	HN_UNLOCK(sc);
4118 }
4119 
4120 #if __FreeBSD_version >= 1100099
4121 
4122 static int
4123 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4124 {
4125 	struct hn_softc *sc = arg1;
4126 	unsigned int lenlim;
4127 	int error;
4128 
4129 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4130 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4131 	if (error || req->newptr == NULL)
4132 		return error;
4133 
4134 	HN_LOCK(sc);
4135 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4136 	    lenlim > TCP_LRO_LENGTH_MAX) {
4137 		HN_UNLOCK(sc);
4138 		return EINVAL;
4139 	}
4140 	hn_set_lro_lenlim(sc, lenlim);
4141 	HN_UNLOCK(sc);
4142 
4143 	return 0;
4144 }
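
/*
 * The handler above backs the dev.hn.UNIT.lro_length_lim sysctl created
 * in hn_create_rx_data().  Illustrative use, assuming unit 0:
 *   # sysctl dev.hn.0.lro_length_lim=65535
 * Values outside [HN_LRO_LENLIM_MIN, TCP_LRO_LENGTH_MAX] are rejected
 * with EINVAL.
 */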
4145 
4146 static int
4147 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4148 {
4149 	struct hn_softc *sc = arg1;
4150 	int ackcnt, error, i;
4151 
4152 	/*
4153 	 * lro_ackcnt_lim is the append count limit;
4154 	 * +1 turns it into the aggregation limit.
4155 	 */
4156 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4157 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4158 	if (error || req->newptr == NULL)
4159 		return error;
4160 
4161 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4162 		return EINVAL;
4163 
4164 	/*
4165 	 * Convert aggregation limit back to append
4166 	 * count limit.
4167 	 */
4168 	--ackcnt;
4169 	HN_LOCK(sc);
4170 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4171 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4172 	HN_UNLOCK(sc);
4173 	return 0;
4174 }
4175 
4176 #endif
4177 
4178 static int
4179 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4180 {
4181 	struct hn_softc *sc = arg1;
4182 	int hcsum = arg2;
4183 	int on, error, i;
4184 
4185 	on = 0;
4186 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4187 		on = 1;
4188 
4189 	error = sysctl_handle_int(oidp, &on, 0, req);
4190 	if (error || req->newptr == NULL)
4191 		return error;
4192 
4193 	HN_LOCK(sc);
4194 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4195 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4196 
4197 		if (on)
4198 			rxr->hn_trust_hcsum |= hcsum;
4199 		else
4200 			rxr->hn_trust_hcsum &= ~hcsum;
4201 	}
4202 	HN_UNLOCK(sc);
4203 	return 0;
4204 }
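
/*
 * The handler above backs the dev.hn.UNIT.trust_hosttcp, trust_hostudp
 * and trust_hostip sysctls created in hn_create_rx_data(); arg2 selects
 * which HN_TRUST_HCSUM_* bit is toggled on every RX ring.  Illustrative
 * use, assuming unit 0:
 *   # sysctl dev.hn.0.trust_hosttcp=1
 */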
4205 
4206 static int
4207 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4208 {
4209 	struct hn_softc *sc = arg1;
4210 	int chim_size, error;
4211 
4212 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4213 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4214 	if (error || req->newptr == NULL)
4215 		return error;
4216 
4217 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4218 		return EINVAL;
4219 
4220 	HN_LOCK(sc);
4221 	hn_set_chim_size(sc, chim_size);
4222 	HN_UNLOCK(sc);
4223 	return 0;
4224 }
4225 
4226 #if __FreeBSD_version < 1100095
4227 static int
4228 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4229 {
4230 	struct hn_softc *sc = arg1;
4231 	int ofs = arg2, i, error;
4232 	struct hn_rx_ring *rxr;
4233 	uint64_t stat;
4234 
4235 	stat = 0;
4236 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4237 		rxr = &sc->hn_rx_ring[i];
4238 		stat += *((int *)((uint8_t *)rxr + ofs));
4239 	}
4240 
4241 	error = sysctl_handle_64(oidp, &stat, 0, req);
4242 	if (error || req->newptr == NULL)
4243 		return error;
4244 
4245 	/* Zero out this stat. */
4246 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4247 		rxr = &sc->hn_rx_ring[i];
4248 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4249 	}
4250 	return 0;
4251 }
4252 #else
4253 static int
4254 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4255 {
4256 	struct hn_softc *sc = arg1;
4257 	int ofs = arg2, i, error;
4258 	struct hn_rx_ring *rxr;
4259 	uint64_t stat;
4260 
4261 	stat = 0;
4262 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4263 		rxr = &sc->hn_rx_ring[i];
4264 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4265 	}
4266 
4267 	error = sysctl_handle_64(oidp, &stat, 0, req);
4268 	if (error || req->newptr == NULL)
4269 		return error;
4270 
4271 	/* Zero out this stat. */
4272 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4273 		rxr = &sc->hn_rx_ring[i];
4274 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4275 	}
4276 	return 0;
4277 }
4278 
4279 #endif
4280 
4281 static int
4282 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4283 {
4284 	struct hn_softc *sc = arg1;
4285 	int ofs = arg2, i, error;
4286 	struct hn_rx_ring *rxr;
4287 	u_long stat;
4288 
4289 	stat = 0;
4290 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4291 		rxr = &sc->hn_rx_ring[i];
4292 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4293 	}
4294 
4295 	error = sysctl_handle_long(oidp, &stat, 0, req);
4296 	if (error || req->newptr == NULL)
4297 		return error;
4298 
4299 	/* Zero out this stat. */
4300 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4301 		rxr = &sc->hn_rx_ring[i];
4302 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4303 	}
4304 	return 0;
4305 }
4306 
4307 static int
4308 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4309 {
4310 	struct hn_softc *sc = arg1;
4311 	int ofs = arg2, i, error;
4312 	struct hn_tx_ring *txr;
4313 	u_long stat;
4314 
4315 	stat = 0;
4316 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4317 		txr = &sc->hn_tx_ring[i];
4318 		stat += *((u_long *)((uint8_t *)txr + ofs));
4319 	}
4320 
4321 	error = sysctl_handle_long(oidp, &stat, 0, req);
4322 	if (error || req->newptr == NULL)
4323 		return error;
4324 
4325 	/* Zero out this stat. */
4326 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4327 		txr = &sc->hn_tx_ring[i];
4328 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4329 	}
4330 	return 0;
4331 }
4332 
4333 static int
4334 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4335 {
4336 	struct hn_softc *sc = arg1;
4337 	int ofs = arg2, i, error, conf;
4338 	struct hn_tx_ring *txr;
4339 
4340 	txr = &sc->hn_tx_ring[0];
4341 	conf = *((int *)((uint8_t *)txr + ofs));
4342 
4343 	error = sysctl_handle_int(oidp, &conf, 0, req);
4344 	if (error || req->newptr == NULL)
4345 		return error;
4346 
4347 	HN_LOCK(sc);
4348 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4349 		txr = &sc->hn_tx_ring[i];
4350 		*((int *)((uint8_t *)txr + ofs)) = conf;
4351 	}
4352 	HN_UNLOCK(sc);
4353 
4354 	return 0;
4355 }
4356 
4357 static int
4358 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4359 {
4360 	struct hn_softc *sc = arg1;
4361 	int error, size;
4362 
4363 	size = sc->hn_agg_size;
4364 	error = sysctl_handle_int(oidp, &size, 0, req);
4365 	if (error || req->newptr == NULL)
4366 		return (error);
4367 
4368 	HN_LOCK(sc);
4369 	sc->hn_agg_size = size;
4370 	hn_set_txagg(sc);
4371 	HN_UNLOCK(sc);
4372 
4373 	return (0);
4374 }
4375 
4376 static int
4377 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4378 {
4379 	struct hn_softc *sc = arg1;
4380 	int error, pkts;
4381 
4382 	pkts = sc->hn_agg_pkts;
4383 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4384 	if (error || req->newptr == NULL)
4385 		return (error);
4386 
4387 	HN_LOCK(sc);
4388 	sc->hn_agg_pkts = pkts;
4389 	hn_set_txagg(sc);
4390 	HN_UNLOCK(sc);
4391 
4392 	return (0);
4393 }
4394 
4395 static int
4396 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4397 {
4398 	struct hn_softc *sc = arg1;
4399 	int pkts;
4400 
4401 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4402 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4403 }
4404 
4405 static int
4406 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4407 {
4408 	struct hn_softc *sc = arg1;
4409 	int align;
4410 
4411 	align = sc->hn_tx_ring[0].hn_agg_align;
4412 	return (sysctl_handle_int(oidp, &align, 0, req));
4413 }
4414 
4415 static void
4416 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4417 {
4418 	if (pollhz == 0)
4419 		vmbus_chan_poll_disable(chan);
4420 	else
4421 		vmbus_chan_poll_enable(chan, pollhz);
4422 }
4423 
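/*
 * Apply the polling frequency to the primary channel and all
 * sub-channels; pollhz == 0 disables polling (see hn_chan_polling()).
 */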
4424 static void
4425 hn_polling(struct hn_softc *sc, u_int pollhz)
4426 {
4427 	int nsubch = sc->hn_rx_ring_inuse - 1;
4428 
4429 	HN_LOCK_ASSERT(sc);
4430 
4431 	if (nsubch > 0) {
4432 		struct vmbus_channel **subch;
4433 		int i;
4434 
4435 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4436 		for (i = 0; i < nsubch; ++i)
4437 			hn_chan_polling(subch[i], pollhz);
4438 		vmbus_subchan_rel(subch, nsubch);
4439 	}
4440 	hn_chan_polling(sc->hn_prichan, pollhz);
4441 }
4442 
4443 static int
4444 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4445 {
4446 	struct hn_softc *sc = arg1;
4447 	int pollhz, error;
4448 
4449 	pollhz = sc->hn_pollhz;
4450 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4451 	if (error || req->newptr == NULL)
4452 		return (error);
4453 
4454 	if (pollhz != 0 &&
4455 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4456 		return (EINVAL);
4457 
4458 	HN_LOCK(sc);
4459 	if (sc->hn_pollhz != pollhz) {
4460 		sc->hn_pollhz = pollhz;
4461 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4462 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4463 			hn_polling(sc, sc->hn_pollhz);
4464 	}
4465 	HN_UNLOCK(sc);
4466 
4467 	return (0);
4468 }
4469 
4470 static int
4471 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4472 {
4473 	struct hn_softc *sc = arg1;
4474 	char verstr[16];
4475 
4476 	snprintf(verstr, sizeof(verstr), "%u.%u",
4477 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4478 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4479 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4480 }
4481 
4482 static int
4483 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4484 {
4485 	struct hn_softc *sc = arg1;
4486 	char caps_str[128];
4487 	uint32_t caps;
4488 
4489 	HN_LOCK(sc);
4490 	caps = sc->hn_caps;
4491 	HN_UNLOCK(sc);
4492 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4493 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4494 }
4495 
4496 static int
4497 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4498 {
4499 	struct hn_softc *sc = arg1;
4500 	char assist_str[128];
4501 	uint32_t hwassist;
4502 
4503 	HN_LOCK(sc);
4504 	hwassist = sc->hn_ifp->if_hwassist;
4505 	HN_UNLOCK(sc);
4506 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4507 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4508 }
4509 
4510 static int
4511 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4512 {
4513 	struct hn_softc *sc = arg1;
4514 	char filter_str[128];
4515 	uint32_t filter;
4516 
4517 	HN_LOCK(sc);
4518 	filter = sc->hn_rx_filter;
4519 	HN_UNLOCK(sc);
4520 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4521 	    NDIS_PACKET_TYPES);
4522 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4523 }
4524 
4525 #ifndef RSS
4526 
4527 static int
4528 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4529 {
4530 	struct hn_softc *sc = arg1;
4531 	int error;
4532 
4533 	HN_LOCK(sc);
4534 
4535 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4536 	if (error || req->newptr == NULL)
4537 		goto back;
4538 
4539 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4540 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4541 		/*
4542 		 * The RSS key is synchronized w/ the VF's; don't allow users
4543 		 * to change it.
4544 		 */
4545 		error = EBUSY;
4546 		goto back;
4547 	}
4548 
4549 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4550 	if (error)
4551 		goto back;
4552 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4553 
4554 	if (sc->hn_rx_ring_inuse > 1) {
4555 		error = hn_rss_reconfig(sc);
4556 	} else {
4557 		/* Not RSS capable, at least for now; just save the RSS key. */
4558 		error = 0;
4559 	}
4560 back:
4561 	HN_UNLOCK(sc);
4562 	return (error);
4563 }
4564 
4565 static int
4566 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4567 {
4568 	struct hn_softc *sc = arg1;
4569 	int error;
4570 
4571 	HN_LOCK(sc);
4572 
4573 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4574 	if (error || req->newptr == NULL)
4575 		goto back;
4576 
4577 	/*
4578 	 * Don't allow RSS indirect table changes if this interface is not
4579 	 * currently RSS capable.
4580 	 */
4581 	if (sc->hn_rx_ring_inuse == 1) {
4582 		error = EOPNOTSUPP;
4583 		goto back;
4584 	}
4585 
4586 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4587 	if (error)
4588 		goto back;
4589 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4590 
4591 	hn_rss_ind_fixup(sc);
4592 	error = hn_rss_reconfig(sc);
4593 back:
4594 	HN_UNLOCK(sc);
4595 	return (error);
4596 }
4597 
4598 #endif	/* !RSS */
4599 
4600 static int
4601 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4602 {
4603 	struct hn_softc *sc = arg1;
4604 	char hash_str[128];
4605 	uint32_t hash;
4606 
4607 	HN_LOCK(sc);
4608 	hash = sc->hn_rss_hash;
4609 	HN_UNLOCK(sc);
4610 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4611 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4612 }
4613 
4614 static int
4615 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4616 {
4617 	struct hn_softc *sc = arg1;
4618 	char hash_str[128];
4619 	uint32_t hash;
4620 
4621 	HN_LOCK(sc);
4622 	hash = sc->hn_rss_hcap;
4623 	HN_UNLOCK(sc);
4624 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4625 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4626 }
4627 
4628 static int
4629 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4630 {
4631 	struct hn_softc *sc = arg1;
4632 	char hash_str[128];
4633 	uint32_t hash;
4634 
4635 	HN_LOCK(sc);
4636 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4637 	HN_UNLOCK(sc);
4638 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4639 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4640 }
4641 
4642 static int
4643 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4644 {
4645 	struct hn_softc *sc = arg1;
4646 	char vf_name[IFNAMSIZ + 1];
4647 	struct ifnet *vf_ifp;
4648 
4649 	HN_LOCK(sc);
4650 	vf_name[0] = '\0';
4651 	vf_ifp = sc->hn_vf_ifp;
4652 	if (vf_ifp != NULL)
4653 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4654 	HN_UNLOCK(sc);
4655 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4656 }
4657 
4658 static int
4659 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4660 {
4661 	struct hn_softc *sc = arg1;
4662 	char vf_name[IFNAMSIZ + 1];
4663 	struct ifnet *vf_ifp;
4664 
4665 	HN_LOCK(sc);
4666 	vf_name[0] = '\0';
4667 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4668 	if (vf_ifp != NULL)
4669 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4670 	HN_UNLOCK(sc);
4671 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4672 }
4673 
4674 static int
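/*
 * Emit a space-separated list of the VF interfaces currently registered
 * in the global hn_vfmap; hn_vfmap_sysctl() below additionally prints
 * each mapping as a "VF:hn" pair.
 */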
4675 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4676 {
4677 	struct rm_priotracker pt;
4678 	struct sbuf *sb;
4679 	int error, i;
4680 	bool first;
4681 
4682 	error = sysctl_wire_old_buffer(req, 0);
4683 	if (error != 0)
4684 		return (error);
4685 
4686 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4687 	if (sb == NULL)
4688 		return (ENOMEM);
4689 
4690 	rm_rlock(&hn_vfmap_lock, &pt);
4691 
4692 	first = true;
4693 	for (i = 0; i < hn_vfmap_size; ++i) {
4694 		struct ifnet *ifp;
4695 
4696 		if (hn_vfmap[i] == NULL)
4697 			continue;
4698 
4699 		ifp = ifnet_byindex(i);
4700 		if (ifp != NULL) {
4701 			if (first)
4702 				sbuf_printf(sb, "%s", ifp->if_xname);
4703 			else
4704 				sbuf_printf(sb, " %s", ifp->if_xname);
4705 			first = false;
4706 		}
4707 	}
4708 
4709 	rm_runlock(&hn_vfmap_lock, &pt);
4710 
4711 	error = sbuf_finish(sb);
4712 	sbuf_delete(sb);
4713 	return (error);
4714 }
4715 
4716 static int
4717 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4718 {
4719 	struct rm_priotracker pt;
4720 	struct sbuf *sb;
4721 	int error, i;
4722 	bool first;
4723 
4724 	error = sysctl_wire_old_buffer(req, 0);
4725 	if (error != 0)
4726 		return (error);
4727 
4728 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4729 	if (sb == NULL)
4730 		return (ENOMEM);
4731 
4732 	rm_rlock(&hn_vfmap_lock, &pt);
4733 
4734 	first = true;
4735 	for (i = 0; i < hn_vfmap_size; ++i) {
4736 		struct ifnet *ifp, *hn_ifp;
4737 
4738 		hn_ifp = hn_vfmap[i];
4739 		if (hn_ifp == NULL)
4740 			continue;
4741 
4742 		ifp = ifnet_byindex(i);
4743 		if (ifp != NULL) {
4744 			if (first) {
4745 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4746 				    hn_ifp->if_xname);
4747 			} else {
4748 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4749 				    hn_ifp->if_xname);
4750 			}
4751 			first = false;
4752 		}
4753 	}
4754 
4755 	rm_runlock(&hn_vfmap_lock, &pt);
4756 
4757 	error = sbuf_finish(sb);
4758 	sbuf_delete(sb);
4759 	return (error);
4760 }
4761 
4762 static int
4763 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4764 {
4765 	struct hn_softc *sc = arg1;
4766 	int error, onoff = 0;
4767 
4768 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4769 		onoff = 1;
4770 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4771 	if (error || req->newptr == NULL)
4772 		return (error);
4773 
4774 	HN_LOCK(sc);
4775 	/* NOTE: hn_vf_lock for hn_transmit() */
4776 	rm_wlock(&sc->hn_vf_lock);
4777 	if (onoff)
4778 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4779 	else
4780 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4781 	rm_wunlock(&sc->hn_vf_lock);
4782 	HN_UNLOCK(sc);
4783 
4784 	return (0);
4785 }
4786 
4787 static int
4788 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4789 {
4790 	struct hn_softc *sc = arg1;
4791 	int enabled = 0;
4792 
4793 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4794 		enabled = 1;
4795 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4796 }
4797 
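/*
 * Sanity-check an IPv4 packet starting at offset hoff within the mbuf
 * and return its IP protocol (e.g. IPPROTO_TCP or IPPROTO_UDP), or
 * IPPROTO_DONE if the headers are truncated, malformed, or the packet
 * is a fragment.
 */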
4798 static int
4799 hn_check_iplen(const struct mbuf *m, int hoff)
4800 {
4801 	const struct ip *ip;
4802 	int len, iphlen, iplen;
4803 	const struct tcphdr *th;
4804 	int thoff;				/* TCP data offset */
4805 
4806 	len = hoff + sizeof(struct ip);
4807 
4808 	/* The packet must be at least the size of an IP header. */
4809 	if (m->m_pkthdr.len < len)
4810 		return IPPROTO_DONE;
4811 
4812 	/* The fixed IP header must reside completely in the first mbuf. */
4813 	if (m->m_len < len)
4814 		return IPPROTO_DONE;
4815 
4816 	ip = mtodo(m, hoff);
4817 
4818 	/* Bound check the packet's stated IP header length. */
4819 	iphlen = ip->ip_hl << 2;
4820 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4821 		return IPPROTO_DONE;
4822 
4823 	/* The full IP header must reside completely in the one mbuf. */
4824 	if (m->m_len < hoff + iphlen)
4825 		return IPPROTO_DONE;
4826 
4827 	iplen = ntohs(ip->ip_len);
4828 
4829 	/*
4830 	 * Check that the amount of data in the buffers is at least
4831 	 * as much as the IP header would have us expect.
4832 	 */
4833 	if (m->m_pkthdr.len < hoff + iplen)
4834 		return IPPROTO_DONE;
4835 
4836 	/*
4837 	 * Ignore IP fragments.
4838 	 */
4839 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4840 		return IPPROTO_DONE;
4841 
4842 	/*
4843 	 * The TCP/IP or UDP/IP header must be entirely contained within
4844 	 * the first fragment of a packet.
4845 	 */
4846 	switch (ip->ip_p) {
4847 	case IPPROTO_TCP:
4848 		if (iplen < iphlen + sizeof(struct tcphdr))
4849 			return IPPROTO_DONE;
4850 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4851 			return IPPROTO_DONE;
4852 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4853 		thoff = th->th_off << 2;
4854 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4855 			return IPPROTO_DONE;
4856 		if (m->m_len < hoff + iphlen + thoff)
4857 			return IPPROTO_DONE;
4858 		break;
4859 	case IPPROTO_UDP:
4860 		if (iplen < iphlen + sizeof(struct udphdr))
4861 			return IPPROTO_DONE;
4862 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4863 			return IPPROTO_DONE;
4864 		break;
4865 	default:
4866 		if (iplen < iphlen)
4867 			return IPPROTO_DONE;
4868 		break;
4869 	}
4870 	return ip->ip_p;
4871 }
4872 
4873 static void
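/*
 * Extract the (VLAN-aware) L3 ethertype into *l3proto and, for IPv4,
 * the validated L4 protocol into *l4proto via hn_check_iplen(); any
 * other ethertype yields IPPROTO_DONE for *l4proto.
 */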
4874 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4875 {
4876 	const struct ether_header *eh;
4877 	uint16_t etype;
4878 	int hoff;
4879 
4880 	hoff = sizeof(*eh);
4881 	/* Checked at the beginning of the caller. */
4882 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4883 
4884 	eh = mtod(m_new, const struct ether_header *);
4885 	etype = ntohs(eh->ether_type);
4886 	if (etype == ETHERTYPE_VLAN) {
4887 		const struct ether_vlan_header *evl;
4888 
4889 		hoff = sizeof(*evl);
4890 		if (m_new->m_len < hoff)
4891 			return;
4892 		evl = mtod(m_new, const struct ether_vlan_header *);
4893 		etype = ntohs(evl->evl_proto);
4894 	}
4895 	*l3proto = etype;
4896 
4897 	if (etype == ETHERTYPE_IP)
4898 		*l4proto = hn_check_iplen(m_new, hoff);
4899 	else
4900 		*l4proto = IPPROTO_DONE;
4901 }
4902 
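/*
 * Allocate the shared RXBUF, the per-ring bufrings, packet buffers and
 * LRO state, and create the dev.hn.UNIT.rx sysctl tree together with
 * the aggregate RX statistics nodes.
 */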
4903 static int
4904 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4905 {
4906 	struct sysctl_oid_list *child;
4907 	struct sysctl_ctx_list *ctx;
4908 	device_t dev = sc->hn_dev;
4909 #if defined(INET) || defined(INET6)
4910 #if __FreeBSD_version >= 1100095
4911 	int lroent_cnt;
4912 #endif
4913 #endif
4914 	int i;
4915 
4916 	/*
4917 	 * Create RXBUF for reception.
4918 	 *
4919 	 * NOTE:
4920 	 * - It is shared by all channels.
4921 	 * - A large enough buffer is allocated; certain versions of the NVS
4922 	 *   may further limit the usable space.
4923 	 */
4924 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4925 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4926 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4927 	if (sc->hn_rxbuf == NULL) {
4928 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4929 		return (ENOMEM);
4930 	}
4931 
4932 	sc->hn_rx_ring_cnt = ring_cnt;
4933 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4934 
4935 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4936 	    M_DEVBUF, M_WAITOK | M_ZERO);
4937 
4938 #if defined(INET) || defined(INET6)
4939 #if __FreeBSD_version >= 1100095
4940 	lroent_cnt = hn_lro_entry_count;
4941 	if (lroent_cnt < TCP_LRO_ENTRIES)
4942 		lroent_cnt = TCP_LRO_ENTRIES;
4943 	if (bootverbose)
4944 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4945 #endif
4946 #endif	/* INET || INET6 */
4947 
4948 	ctx = device_get_sysctl_ctx(dev);
4949 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4950 
4951 	/* Create dev.hn.UNIT.rx sysctl tree */
4952 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4953 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4954 
4955 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4956 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4957 
4958 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4959 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4960 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4961 		if (rxr->hn_br == NULL) {
4962 			device_printf(dev, "allocate bufring failed\n");
4963 			return (ENOMEM);
4964 		}
4965 
4966 		if (hn_trust_hosttcp)
4967 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4968 		if (hn_trust_hostudp)
4969 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4970 		if (hn_trust_hostip)
4971 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4972 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4973 		rxr->hn_ifp = sc->hn_ifp;
4974 		if (i < sc->hn_tx_ring_cnt)
4975 			rxr->hn_txr = &sc->hn_tx_ring[i];
4976 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4977 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4978 		rxr->hn_rx_idx = i;
4979 		rxr->hn_rxbuf = sc->hn_rxbuf;
4980 
4981 		/*
4982 		 * Initialize LRO.
4983 		 */
4984 #if defined(INET) || defined(INET6)
4985 #if __FreeBSD_version >= 1100095
4986 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4987 		    hn_lro_mbufq_depth);
4988 #else
4989 		tcp_lro_init(&rxr->hn_lro);
4990 		rxr->hn_lro.ifp = sc->hn_ifp;
4991 #endif
4992 #if __FreeBSD_version >= 1100099
4993 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4994 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4995 #endif
4996 #endif	/* INET || INET6 */
4997 
4998 		if (sc->hn_rx_sysctl_tree != NULL) {
4999 			char name[16];
5000 
5001 			/*
5002 			 * Create per RX ring sysctl tree:
5003 			 * dev.hn.UNIT.rx.RINGID
5004 			 */
5005 			snprintf(name, sizeof(name), "%d", i);
5006 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5007 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5008 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5009 
5010 			if (rxr->hn_rx_sysctl_tree != NULL) {
5011 				SYSCTL_ADD_ULONG(ctx,
5012 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5013 				    OID_AUTO, "packets", CTLFLAG_RW,
5014 				    &rxr->hn_pkts, "# of packets received");
5015 				SYSCTL_ADD_ULONG(ctx,
5016 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5017 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5018 				    &rxr->hn_rss_pkts,
5019 				    "# of packets w/ RSS info received");
5020 				SYSCTL_ADD_INT(ctx,
5021 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5022 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5023 				    &rxr->hn_pktbuf_len, 0,
5024 				    "Temporary channel packet buffer length");
5025 			}
5026 		}
5027 	}
5028 
5029 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5030 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5031 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5032 #if __FreeBSD_version < 1100095
5033 	    hn_rx_stat_int_sysctl,
5034 #else
5035 	    hn_rx_stat_u64_sysctl,
5036 #endif
5037 	    "LU", "LRO queued");
5038 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5039 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5040 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5041 #if __FreeBSD_version < 1100095
5042 	    hn_rx_stat_int_sysctl,
5043 #else
5044 	    hn_rx_stat_u64_sysctl,
5045 #endif
5046 	    "LU", "LRO flushed");
5047 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5048 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5049 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5050 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5051 #if __FreeBSD_version >= 1100099
5052 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5053 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5054 	    hn_lro_lenlim_sysctl, "IU",
5055 	    "Max # of data bytes to be aggregated by LRO");
5056 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5057 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5058 	    hn_lro_ackcnt_sysctl, "I",
5059 	    "Max # of ACKs to be aggregated by LRO");
5060 #endif
5061 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5062 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5063 	    hn_trust_hcsum_sysctl, "I",
5064 	    "Trust tcp segment verification on host side, "
5065 	    "when csum info is missing");
5066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5067 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5068 	    hn_trust_hcsum_sysctl, "I",
5069 	    "Trust udp datagram verification on host side, "
5070 	    "when csum info is missing");
5071 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5072 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5073 	    hn_trust_hcsum_sysctl, "I",
5074 	    "Trust ip packet verification on host side, "
5075 	    "when csum info is missing");
5076 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5077 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5078 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5079 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5080 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5081 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5082 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5083 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5084 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5085 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5086 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5087 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5088 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5089 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5090 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5091 	    hn_rx_stat_ulong_sysctl, "LU",
5092 	    "# of packets that we trust host's csum verification");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5094 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5096 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5098 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5099 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5100 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5101 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5102 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5103 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5104 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5105 
5106 	return (0);
5107 }
5108 
5109 static void
5110 hn_destroy_rx_data(struct hn_softc *sc)
5111 {
5112 	int i;
5113 
5114 	if (sc->hn_rxbuf != NULL) {
5115 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5116 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5117 		else
5118 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5119 		sc->hn_rxbuf = NULL;
5120 	}
5121 
5122 	if (sc->hn_rx_ring_cnt == 0)
5123 		return;
5124 
5125 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5126 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5127 
5128 		if (rxr->hn_br == NULL)
5129 			continue;
5130 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5131 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5132 		} else {
5133 			device_printf(sc->hn_dev,
5134 			    "%dth channel bufring is referenced\n", i);
5135 		}
5136 		rxr->hn_br = NULL;
5137 
5138 #if defined(INET) || defined(INET6)
5139 		tcp_lro_free(&rxr->hn_lro);
5140 #endif
5141 		free(rxr->hn_pktbuf, M_DEVBUF);
5142 	}
5143 	free(sc->hn_rx_ring, M_DEVBUF);
5144 	sc->hn_rx_ring = NULL;
5145 
5146 	sc->hn_rx_ring_cnt = 0;
5147 	sc->hn_rx_ring_inuse = 0;
5148 }
5149 
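/*
 * Set up one TX ring: its locks, txdesc array (and bufring), taskqueue
 * binding, the RNDIS and data DMA tags, the per-descriptor DMA memory
 * and maps, and the per-ring dev.hn.UNIT.tx.RINGID sysctl nodes.
 */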
5150 static int
5151 hn_tx_ring_create(struct hn_softc *sc, int id)
5152 {
5153 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5154 	device_t dev = sc->hn_dev;
5155 	bus_dma_tag_t parent_dtag;
5156 	int error, i;
5157 
5158 	txr->hn_sc = sc;
5159 	txr->hn_tx_idx = id;
5160 
5161 #ifndef HN_USE_TXDESC_BUFRING
5162 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5163 #endif
5164 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5165 
5166 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5167 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5168 	    M_DEVBUF, M_WAITOK | M_ZERO);
5169 #ifndef HN_USE_TXDESC_BUFRING
5170 	SLIST_INIT(&txr->hn_txlist);
5171 #else
5172 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5173 	    M_WAITOK, &txr->hn_tx_lock);
5174 #endif
5175 
5176 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5177 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5178 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5179 	} else {
5180 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5181 	}
5182 
5183 #ifdef HN_IFSTART_SUPPORT
5184 	if (hn_use_if_start) {
5185 		txr->hn_txeof = hn_start_txeof;
5186 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5187 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5188 	} else
5189 #endif
5190 	{
5191 		int br_depth;
5192 
5193 		txr->hn_txeof = hn_xmit_txeof;
5194 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5195 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5196 
5197 		br_depth = hn_get_txswq_depth(txr);
5198 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5199 		    M_WAITOK, &txr->hn_tx_lock);
5200 	}
5201 
5202 	txr->hn_direct_tx_size = hn_direct_tx_size;
5203 
5204 	/*
5205 	 * Always schedule transmission instead of trying to do direct
5206 	 * transmission.  This one gives the best performance so far.
5207 	 */
5208 	txr->hn_sched_tx = 1;
5209 
5210 	parent_dtag = bus_get_dma_tag(dev);
5211 
5212 	/* DMA tag for RNDIS packet messages. */
5213 	error = bus_dma_tag_create(parent_dtag, /* parent */
5214 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5215 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5216 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5217 	    BUS_SPACE_MAXADDR,		/* highaddr */
5218 	    NULL, NULL,			/* filter, filterarg */
5219 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5220 	    1,				/* nsegments */
5221 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5222 	    0,				/* flags */
5223 	    NULL,			/* lockfunc */
5224 	    NULL,			/* lockfuncarg */
5225 	    &txr->hn_tx_rndis_dtag);
5226 	if (error) {
5227 		device_printf(dev, "failed to create rndis dmatag\n");
5228 		return error;
5229 	}
5230 
5231 	/* DMA tag for data. */
5232 	error = bus_dma_tag_create(parent_dtag, /* parent */
5233 	    1,				/* alignment */
5234 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5235 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5236 	    BUS_SPACE_MAXADDR,		/* highaddr */
5237 	    NULL, NULL,			/* filter, filterarg */
5238 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5239 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5240 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5241 	    0,				/* flags */
5242 	    NULL,			/* lockfunc */
5243 	    NULL,			/* lockfuncarg */
5244 	    &txr->hn_tx_data_dtag);
5245 	if (error) {
5246 		device_printf(dev, "failed to create data dmatag\n");
5247 		return error;
5248 	}
5249 
5250 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5251 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5252 
5253 		txd->txr = txr;
5254 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5255 		STAILQ_INIT(&txd->agg_list);
5256 
5257 		/*
5258 		 * Allocate and load RNDIS packet message.
5259 		 */
5260 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5261 		    (void **)&txd->rndis_pkt,
5262 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5263 		    &txd->rndis_pkt_dmap);
5264 		if (error) {
5265 			device_printf(dev,
5266 			    "failed to allocate rndis_packet_msg, %d\n", i);
5267 			return error;
5268 		}
5269 
5270 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5271 		    txd->rndis_pkt_dmap,
5272 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5273 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5274 		    BUS_DMA_NOWAIT);
5275 		if (error) {
5276 			device_printf(dev,
5277 			    "failed to load rndis_packet_msg, %d\n", i);
5278 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5279 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5280 			return error;
5281 		}
5282 
5283 		/* DMA map for TX data. */
5284 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5285 		    &txd->data_dmap);
5286 		if (error) {
5287 			device_printf(dev,
5288 			    "failed to allocate tx data dmamap\n");
5289 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5290 			    txd->rndis_pkt_dmap);
5291 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5292 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5293 			return error;
5294 		}
5295 
5296 		/* All set, put it to list */
5297 		txd->flags |= HN_TXD_FLAG_ONLIST;
5298 #ifndef HN_USE_TXDESC_BUFRING
5299 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5300 #else
5301 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5302 #endif
5303 	}
5304 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5305 
5306 	if (sc->hn_tx_sysctl_tree != NULL) {
5307 		struct sysctl_oid_list *child;
5308 		struct sysctl_ctx_list *ctx;
5309 		char name[16];
5310 
5311 		/*
5312 		 * Create per TX ring sysctl tree:
5313 		 * dev.hn.UNIT.tx.RINGID
5314 		 */
5315 		ctx = device_get_sysctl_ctx(dev);
5316 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5317 
5318 		snprintf(name, sizeof(name), "%d", id);
5319 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5320 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5321 
5322 		if (txr->hn_tx_sysctl_tree != NULL) {
5323 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5324 
5325 #ifdef HN_DEBUG
5326 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5327 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5328 			    "# of available TX descs");
5329 #endif
5330 #ifdef HN_IFSTART_SUPPORT
5331 			if (!hn_use_if_start)
5332 #endif
5333 			{
5334 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5335 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5336 				    "over active");
5337 			}
5338 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5339 			    CTLFLAG_RW, &txr->hn_pkts,
5340 			    "# of packets transmitted");
5341 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5342 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5343 		}
5344 	}
5345 
5346 	return 0;
5347 }
5348 
5349 static void
5350 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5351 {
5352 	struct hn_tx_ring *txr = txd->txr;
5353 
5354 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5355 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5356 
5357 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5358 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5359 	    txd->rndis_pkt_dmap);
5360 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5361 }
5362 
5363 static void
5364 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5365 {
5366 
5367 	KASSERT(txd->refs == 0 || txd->refs == 1,
5368 	    ("invalid txd refs %d", txd->refs));
5369 
5370 	/* Aggregated txds will be freed by their aggregating txd. */
5371 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5372 		int freed;
5373 
5374 		freed = hn_txdesc_put(txr, txd);
5375 		KASSERT(freed, ("can't free txdesc"));
5376 	}
5377 }
5378 
5379 static void
5380 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5381 {
5382 	int i;
5383 
5384 	if (txr->hn_txdesc == NULL)
5385 		return;
5386 
5387 	/*
5388 	 * NOTE:
5389 	 * Because the freeing of aggregated txds will be deferred
5390 	 * to the aggregating txd, two passes are used here:
5391 	 * - The first pass GCes any pending txds.  This GC is necessary,
5392 	 *   since if the channels are revoked, the hypervisor will not
5393 	 *   deliver send-done for all pending txds.
5394 	 * - The second pass frees the busdma resources, i.e. after all
5395 	 *   txds have been freed.
5396 	 */
5397 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5398 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5399 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5400 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5401 
5402 	if (txr->hn_tx_data_dtag != NULL)
5403 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5404 	if (txr->hn_tx_rndis_dtag != NULL)
5405 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5406 
5407 #ifdef HN_USE_TXDESC_BUFRING
5408 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5409 #endif
5410 
5411 	free(txr->hn_txdesc, M_DEVBUF);
5412 	txr->hn_txdesc = NULL;
5413 
5414 	if (txr->hn_mbuf_br != NULL)
5415 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5416 
5417 #ifndef HN_USE_TXDESC_BUFRING
5418 	mtx_destroy(&txr->hn_txlist_spin);
5419 #endif
5420 	mtx_destroy(&txr->hn_tx_lock);
5421 }
5422 
5423 static int
5424 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5425 {
5426 	struct sysctl_oid_list *child;
5427 	struct sysctl_ctx_list *ctx;
5428 	int i;
5429 
5430 	/*
5431 	 * Create TXBUF for chimney sending.
5432 	 *
5433 	 * NOTE: It is shared by all channels.
5434 	 */
5435 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5436 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5437 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5438 	if (sc->hn_chim == NULL) {
5439 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5440 		return (ENOMEM);
5441 	}
5442 
5443 	sc->hn_tx_ring_cnt = ring_cnt;
5444 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5445 
5446 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5447 	    M_DEVBUF, M_WAITOK | M_ZERO);
5448 
5449 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5450 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5451 
5452 	/* Create dev.hn.UNIT.tx sysctl tree */
5453 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5454 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5455 
5456 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5457 		int error;
5458 
5459 		error = hn_tx_ring_create(sc, i);
5460 		if (error)
5461 			return error;
5462 	}
5463 
5464 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5465 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5466 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5467 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5468 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5469 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5470 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5471 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5472 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5473 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5474 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5475 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5476 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5477 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5478 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5479 	    hn_tx_stat_ulong_sysctl, "LU",
5480 	    "# of packet transmission aggregation flush failure");
5481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5482 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5483 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5484 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5485 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5486 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5487 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5488 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5489 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5490 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5491 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5492 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5493 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5494 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5495 	    "# of total TX descs");
5496 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5497 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5498 	    "Chimney send packet size upper boundary");
5499 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5500 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5501 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5502 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5503 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5504 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5505 	    hn_tx_conf_int_sysctl, "I",
5506 	    "Size of the packet for direct transmission");
5507 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5508 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5509 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5510 	    hn_tx_conf_int_sysctl, "I",
5511 	    "Always schedule transmission "
5512 	    "instead of doing direct transmission");
5513 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5514 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5515 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5516 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5517 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5518 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5519 	    "Applied packet transmission aggregation size");
5520 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5521 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5522 	    hn_txagg_pktmax_sysctl, "I",
5523 	    "Applied packet transmission aggregation packets");
5524 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5525 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5526 	    hn_txagg_align_sysctl, "I",
5527 	    "Applied packet transmission aggregation alignment");
5528 
5529 	return (0);
5530 }
5531 
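/*
 * Propagate the chimney (copy-into-TXBUF) send size limit to all TX rings.
 */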
5532 static void
5533 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5534 {
5535 	int i;
5536 
5537 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5538 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5539 }
5540 
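/*
 * Compute the per-packet TSO size limit: clamp the requested maximum
 * between the NDIS minimum (sgmin * MTU) and both IP_MAXPACKET and the
 * NDIS maximum, then subtract the Ethernet/VLAN header overhead.  If a
 * transparent VF is ready, its TSO limit is honored as well.
 */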
5541 static void
5542 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5543 {
5544 	struct ifnet *ifp = sc->hn_ifp;
5545 	u_int hw_tsomax;
5546 	int tso_minlen;
5547 
5548 	HN_LOCK_ASSERT(sc);
5549 
5550 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5551 		return;
5552 
5553 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5554 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5555 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5556 
5557 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5558 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5559 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5560 
5561 	if (tso_maxlen < tso_minlen)
5562 		tso_maxlen = tso_minlen;
5563 	else if (tso_maxlen > IP_MAXPACKET)
5564 		tso_maxlen = IP_MAXPACKET;
5565 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5566 		tso_maxlen = sc->hn_ndis_tso_szmax;
5567 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5568 
5569 	if (hn_xpnt_vf_isready(sc)) {
5570 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5571 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5572 	}
5573 	ifp->if_hw_tsomax = hw_tsomax;
5574 	if (bootverbose)
5575 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5576 }
5577 
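/*
 * Apply negotiated TX settings to all TX rings: the chimney send size,
 * the checksum offload assistance bits derived from hn_caps, and the
 * HASHVAL pktinfo flag when the host supports it.
 */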
5578 static void
5579 hn_fixup_tx_data(struct hn_softc *sc)
5580 {
5581 	uint64_t csum_assist;
5582 	int i;
5583 
5584 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5585 	if (hn_tx_chimney_size > 0 &&
5586 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5587 		hn_set_chim_size(sc, hn_tx_chimney_size);
5588 
5589 	csum_assist = 0;
5590 	if (sc->hn_caps & HN_CAP_IPCS)
5591 		csum_assist |= CSUM_IP;
5592 	if (sc->hn_caps & HN_CAP_TCP4CS)
5593 		csum_assist |= CSUM_IP_TCP;
5594 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5595 		csum_assist |= CSUM_IP_UDP;
5596 	if (sc->hn_caps & HN_CAP_TCP6CS)
5597 		csum_assist |= CSUM_IP6_TCP;
5598 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5599 		csum_assist |= CSUM_IP6_UDP;
5600 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5601 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5602 
5603 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5604 		/*
5605 		 * Support HASHVAL pktinfo on TX path.
5606 		 */
5607 		if (bootverbose)
5608 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5609 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5610 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5611 	}
5612 }
5613 
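/*
 * Apply negotiated RX settings to all RX rings; currently only the
 * UDP hash capability is propagated.
 */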
5614 static void
5615 hn_fixup_rx_data(struct hn_softc *sc)
5616 {
5617 
5618 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5619 		int i;
5620 
5621 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5622 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5623 	}
5624 }
5625 
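/*
 * Tear down TX resources: free the chimney TXBUF, unless it is still
 * referenced, then destroy and free all TX rings.
 */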
5626 static void
5627 hn_destroy_tx_data(struct hn_softc *sc)
5628 {
5629 	int i;
5630 
5631 	if (sc->hn_chim != NULL) {
5632 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5633 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5634 		} else {
5635 			device_printf(sc->hn_dev,
5636 			    "chimney sending buffer is referenced");
5637 		}
5638 		sc->hn_chim = NULL;
5639 	}
5640 
5641 	if (sc->hn_tx_ring_cnt == 0)
5642 		return;
5643 
5644 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5645 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5646 
5647 	free(sc->hn_tx_ring, M_DEVBUF);
5648 	sc->hn_tx_ring = NULL;
5649 
5650 	sc->hn_tx_ring_cnt = 0;
5651 	sc->hn_tx_ring_inuse = 0;
5652 }
5653 
5654 #ifdef HN_IFSTART_SUPPORT
5655 
5656 static void
5657 hn_start_taskfunc(void *xtxr, int pending __unused)
5658 {
5659 	struct hn_tx_ring *txr = xtxr;
5660 
5661 	mtx_lock(&txr->hn_tx_lock);
5662 	hn_start_locked(txr, 0);
5663 	mtx_unlock(&txr->hn_tx_lock);
5664 }
5665 
5666 static int
5667 hn_start_locked(struct hn_tx_ring *txr, int len)
5668 {
5669 	struct hn_softc *sc = txr->hn_sc;
5670 	struct ifnet *ifp = sc->hn_ifp;
5671 	int sched = 0;
5672 
5673 	KASSERT(hn_use_if_start,
5674 	    ("hn_start_locked is called, when if_start is disabled"));
5675 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5676 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5677 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5678 
5679 	if (__predict_false(txr->hn_suspended))
5680 		return (0);
5681 
5682 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5683 	    IFF_DRV_RUNNING)
5684 		return (0);
5685 
5686 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5687 		struct hn_txdesc *txd;
5688 		struct mbuf *m_head;
5689 		int error;
5690 
5691 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5692 		if (m_head == NULL)
5693 			break;
5694 
5695 		if (len > 0 && m_head->m_pkthdr.len > len) {
5696 			/*
5697 			 * This send could be time-consuming; let callers
5698 			 * dispatch this packet send (and the sending of any
5699 			 * follow-up packets) to the tx taskqueue.
5700 			 */
5701 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5702 			sched = 1;
5703 			break;
5704 		}
5705 
5706 #if defined(INET6) || defined(INET)
5707 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5708 			m_head = hn_tso_fixup(m_head);
5709 			if (__predict_false(m_head == NULL)) {
5710 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5711 				continue;
5712 			}
5713 		} else if (m_head->m_pkthdr.csum_flags &
5714 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5715 			m_head = hn_set_hlen(m_head);
5716 			if (__predict_false(m_head == NULL)) {
5717 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5718 				continue;
5719 			}
5720 		}
5721 #endif
5722 
5723 		txd = hn_txdesc_get(txr);
5724 		if (txd == NULL) {
5725 			txr->hn_no_txdescs++;
5726 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5727 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5728 			break;
5729 		}
5730 
5731 		error = hn_encap(ifp, txr, txd, &m_head);
5732 		if (error) {
5733 			/* Both txd and m_head are freed */
5734 			KASSERT(txr->hn_agg_txd == NULL,
5735 			    ("encap failed w/ pending aggregating txdesc"));
5736 			continue;
5737 		}
5738 
5739 		if (txr->hn_agg_pktleft == 0) {
5740 			if (txr->hn_agg_txd != NULL) {
5741 				KASSERT(m_head == NULL,
5742 				    ("pending mbuf for aggregating txdesc"));
5743 				error = hn_flush_txagg(ifp, txr);
5744 				if (__predict_false(error)) {
5745 					atomic_set_int(&ifp->if_drv_flags,
5746 					    IFF_DRV_OACTIVE);
5747 					break;
5748 				}
5749 			} else {
5750 				KASSERT(m_head != NULL, ("mbuf was freed"));
5751 				error = hn_txpkt(ifp, txr, txd);
5752 				if (__predict_false(error)) {
5753 					/* txd is freed, but m_head is not */
5754 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5755 					atomic_set_int(&ifp->if_drv_flags,
5756 					    IFF_DRV_OACTIVE);
5757 					break;
5758 				}
5759 			}
5760 		}
5761 #ifdef INVARIANTS
5762 		else {
5763 			KASSERT(txr->hn_agg_txd != NULL,
5764 			    ("no aggregating txdesc"));
5765 			KASSERT(m_head == NULL,
5766 			    ("pending mbuf for aggregating txdesc"));
5767 		}
5768 #endif
5769 	}
5770 
5771 	/* Flush pending aggregated transmission. */
5772 	if (txr->hn_agg_txd != NULL)
5773 		hn_flush_txagg(ifp, txr);
5774 	return (sched);
5775 }
5776 
5777 static void
5778 hn_start(struct ifnet *ifp)
5779 {
5780 	struct hn_softc *sc = ifp->if_softc;
5781 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5782 
5783 	if (txr->hn_sched_tx)
5784 		goto do_sched;
5785 
5786 	if (mtx_trylock(&txr->hn_tx_lock)) {
5787 		int sched;
5788 
5789 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5790 		mtx_unlock(&txr->hn_tx_lock);
5791 		if (!sched)
5792 			return;
5793 	}
5794 do_sched:
5795 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5796 }
5797 
5798 static void
5799 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5800 {
5801 	struct hn_tx_ring *txr = xtxr;
5802 
5803 	mtx_lock(&txr->hn_tx_lock);
5804 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5805 	hn_start_locked(txr, 0);
5806 	mtx_unlock(&txr->hn_tx_lock);
5807 }
5808 
5809 static void
5810 hn_start_txeof(struct hn_tx_ring *txr)
5811 {
5812 	struct hn_softc *sc = txr->hn_sc;
5813 	struct ifnet *ifp = sc->hn_ifp;
5814 
5815 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5816 
5817 	if (txr->hn_sched_tx)
5818 		goto do_sched;
5819 
5820 	if (mtx_trylock(&txr->hn_tx_lock)) {
5821 		int sched;
5822 
5823 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5824 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5825 		mtx_unlock(&txr->hn_tx_lock);
5826 		if (sched) {
5827 			taskqueue_enqueue(txr->hn_tx_taskq,
5828 			    &txr->hn_tx_task);
5829 		}
5830 	} else {
5831 do_sched:
5832 		/*
5833 		 * Release OACTIVE earlier, in the hope that others
5834 		 * can catch up.  The task will clear the flag again,
5835 		 * with the hn_tx_lock held, to avoid possible races.
5836 		 * races.
5837 		 */
5838 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5839 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5840 	}
5841 }
5842 
5843 #endif	/* HN_IFSTART_SUPPORT */
5844 
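/*
 * Drain the per-ring mbuf buf_ring (drbr) and transmit the dequeued
 * packets, aggregating them when possible.  Returns non-zero if the
 * caller should reschedule the transmission on the TX taskqueue.
 */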
5845 static int
5846 hn_xmit(struct hn_tx_ring *txr, int len)
5847 {
5848 	struct hn_softc *sc = txr->hn_sc;
5849 	struct ifnet *ifp = sc->hn_ifp;
5850 	struct mbuf *m_head;
5851 	int sched = 0;
5852 
5853 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5854 #ifdef HN_IFSTART_SUPPORT
5855 	KASSERT(hn_use_if_start == 0,
5856 	    ("hn_xmit is called, when if_start is enabled"));
5857 #endif
5858 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5859 
5860 	if (__predict_false(txr->hn_suspended))
5861 		return (0);
5862 
5863 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5864 		return (0);
5865 
5866 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5867 		struct hn_txdesc *txd;
5868 		int error;
5869 
5870 		if (len > 0 && m_head->m_pkthdr.len > len) {
5871 			/*
5872 			 * This send could be time-consuming; let callers
5873 			 * dispatch this packet send (and the sending of any
5874 			 * follow-up packets) to the tx taskqueue.
5875 			 */
5876 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5877 			sched = 1;
5878 			break;
5879 		}
5880 
5881 		txd = hn_txdesc_get(txr);
5882 		if (txd == NULL) {
5883 			txr->hn_no_txdescs++;
5884 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5885 			txr->hn_oactive = 1;
5886 			break;
5887 		}
5888 
5889 		error = hn_encap(ifp, txr, txd, &m_head);
5890 		if (error) {
5891 			/* Both txd and m_head are freed; discard */
5892 			KASSERT(txr->hn_agg_txd == NULL,
5893 			    ("encap failed w/ pending aggregating txdesc"));
5894 			drbr_advance(ifp, txr->hn_mbuf_br);
5895 			continue;
5896 		}
5897 
5898 		if (txr->hn_agg_pktleft == 0) {
5899 			if (txr->hn_agg_txd != NULL) {
5900 				KASSERT(m_head == NULL,
5901 				    ("pending mbuf for aggregating txdesc"));
5902 				error = hn_flush_txagg(ifp, txr);
5903 				if (__predict_false(error)) {
5904 					txr->hn_oactive = 1;
5905 					break;
5906 				}
5907 			} else {
5908 				KASSERT(m_head != NULL, ("mbuf was freed"));
5909 				error = hn_txpkt(ifp, txr, txd);
5910 				if (__predict_false(error)) {
5911 					/* txd is freed, but m_head is not */
5912 					drbr_putback(ifp, txr->hn_mbuf_br,
5913 					    m_head);
5914 					txr->hn_oactive = 1;
5915 					break;
5916 				}
5917 			}
5918 		}
5919 #ifdef INVARIANTS
5920 		else {
5921 			KASSERT(txr->hn_agg_txd != NULL,
5922 			    ("no aggregating txdesc"));
5923 			KASSERT(m_head == NULL,
5924 			    ("pending mbuf for aggregating txdesc"));
5925 		}
5926 #endif
5927 
5928 		/* Sent */
5929 		drbr_advance(ifp, txr->hn_mbuf_br);
5930 	}
5931 
5932 	/* Flush pending aggregated transmission. */
5933 	if (txr->hn_agg_txd != NULL)
5934 		hn_flush_txagg(ifp, txr);
5935 	return (sched);
5936 }
5937 
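/*
 * if_transmit method.  If a transparent VF is active, hand the packet
 * to the VF (optionally tapping BPF); otherwise fix up the headers,
 * pick a TX ring based on the mbuf flowid, and enqueue the packet to
 * that ring's buf_ring.
 */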
5938 static int
5939 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5940 {
5941 	struct hn_softc *sc = ifp->if_softc;
5942 	struct hn_tx_ring *txr;
5943 	int error, idx = 0;
5944 
5945 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5946 		struct rm_priotracker pt;
5947 
5948 		rm_rlock(&sc->hn_vf_lock, &pt);
5949 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5950 			struct mbuf *m_bpf = NULL;
5951 			int obytes, omcast;
5952 
5953 			obytes = m->m_pkthdr.len;
5954 			omcast = (m->m_flags & M_MCAST) != 0;
5955 
5956 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5957 				if (bpf_peers_present(ifp->if_bpf)) {
5958 					m_bpf = m_copypacket(m, M_NOWAIT);
5959 					if (m_bpf == NULL) {
5960 						/*
5961 						 * Failed to grab a shallow
5962 						 * copy; tap now.
5963 						 */
5964 						ETHER_BPF_MTAP(ifp, m);
5965 					}
5966 				}
5967 			} else {
5968 				ETHER_BPF_MTAP(ifp, m);
5969 			}
5970 
5971 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5972 			rm_runlock(&sc->hn_vf_lock, &pt);
5973 
5974 			if (m_bpf != NULL) {
5975 				if (!error)
5976 					ETHER_BPF_MTAP(ifp, m_bpf);
5977 				m_freem(m_bpf);
5978 			}
5979 
5980 			if (error == ENOBUFS) {
5981 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5982 			} else if (error) {
5983 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5984 			} else {
5985 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5986 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5987 				if (omcast) {
5988 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5989 					    omcast);
5990 				}
5991 			}
5992 			return (error);
5993 		}
5994 		rm_runlock(&sc->hn_vf_lock, &pt);
5995 	}
5996 
5997 #if defined(INET6) || defined(INET)
5998 	/*
5999 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6000 	 * since packet headers should be cache-hot.
6001 	 */
6002 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6003 		m = hn_tso_fixup(m);
6004 		if (__predict_false(m == NULL)) {
6005 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6006 			return (EIO);
6007 		}
6008 	} else if (m->m_pkthdr.csum_flags &
6009 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6010 		m = hn_set_hlen(m);
6011 		if (__predict_false(m == NULL)) {
6012 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6013 			return (EIO);
6014 		}
6015 	}
6016 #endif
6017 
6018 	/*
6019 	 * Select the TX ring based on flowid
6020 	 */
6021 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6022 #ifdef RSS
6023 		uint32_t bid;
6024 
6025 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6026 		    &bid) == 0)
6027 			idx = bid % sc->hn_tx_ring_inuse;
6028 		else
6029 #endif
6030 		{
6031 #if defined(INET6) || defined(INET)
6032 			int tcpsyn = 0;
6033 
6034 			if (m->m_pkthdr.len < 128 &&
6035 			    (m->m_pkthdr.csum_flags &
6036 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6037 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6038 				m = hn_check_tcpsyn(m, &tcpsyn);
6039 				if (__predict_false(m == NULL)) {
6040 					if_inc_counter(ifp,
6041 					    IFCOUNTER_OERRORS, 1);
6042 					return (EIO);
6043 				}
6044 			}
6045 #else
6046 			const int tcpsyn = 0;
6047 #endif
6048 			if (tcpsyn)
6049 				idx = 0;
6050 			else
6051 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6052 		}
6053 	}
6054 	txr = &sc->hn_tx_ring[idx];
6055 
6056 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6057 	if (error) {
6058 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6059 		return (error);
6060 	}
6061 
6062 	if (txr->hn_oactive)
6063 		return (0);
6064 
6065 	if (txr->hn_sched_tx)
6066 		goto do_sched;
6067 
6068 	if (mtx_trylock(&txr->hn_tx_lock)) {
6069 		int sched;
6070 
6071 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6072 		mtx_unlock(&txr->hn_tx_lock);
6073 		if (!sched)
6074 			return (0);
6075 	}
6076 do_sched:
6077 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6078 	return (0);
6079 }
6080 
6081 static void
6082 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6083 {
6084 	struct mbuf *m;
6085 
6086 	mtx_lock(&txr->hn_tx_lock);
6087 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6088 		m_freem(m);
6089 	mtx_unlock(&txr->hn_tx_lock);
6090 }
6091 
6092 static void
6093 hn_xmit_qflush(struct ifnet *ifp)
6094 {
6095 	struct hn_softc *sc = ifp->if_softc;
6096 	struct rm_priotracker pt;
6097 	int i;
6098 
6099 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6100 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6101 	if_qflush(ifp);
6102 
6103 	rm_rlock(&sc->hn_vf_lock, &pt);
6104 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6105 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6106 	rm_runlock(&sc->hn_vf_lock, &pt);
6107 }
6108 
6109 static void
6110 hn_xmit_txeof(struct hn_tx_ring *txr)
6111 {
6112 
6113 	if (txr->hn_sched_tx)
6114 		goto do_sched;
6115 
6116 	if (mtx_trylock(&txr->hn_tx_lock)) {
6117 		int sched;
6118 
6119 		txr->hn_oactive = 0;
6120 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6121 		mtx_unlock(&txr->hn_tx_lock);
6122 		if (sched) {
6123 			taskqueue_enqueue(txr->hn_tx_taskq,
6124 			    &txr->hn_tx_task);
6125 		}
6126 	} else {
6127 do_sched:
6128 		/*
6129 		 * Release oactive earlier, in the hope that others
6130 		 * can catch up.  The task will clear oactive again,
6131 		 * with the hn_tx_lock held, to avoid possible races.
6133 		 */
6134 		txr->hn_oactive = 0;
6135 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6136 	}
6137 }
6138 
6139 static void
6140 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6141 {
6142 	struct hn_tx_ring *txr = xtxr;
6143 
6144 	mtx_lock(&txr->hn_tx_lock);
6145 	hn_xmit(txr, 0);
6146 	mtx_unlock(&txr->hn_tx_lock);
6147 }
6148 
6149 static void
6150 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6151 {
6152 	struct hn_tx_ring *txr = xtxr;
6153 
6154 	mtx_lock(&txr->hn_tx_lock);
6155 	txr->hn_oactive = 0;
6156 	hn_xmit(txr, 0);
6157 	mtx_unlock(&txr->hn_tx_lock);
6158 }
6159 
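/*
 * Link a VMBus channel to its RX ring (and TX ring, if one exists for
 * this sub-channel index), bind the channel to a CPU, and open it with
 * the ring's bufring memory.
 */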
6160 static int
6161 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6162 {
6163 	struct vmbus_chan_br cbr;
6164 	struct hn_rx_ring *rxr;
6165 	struct hn_tx_ring *txr = NULL;
6166 	int idx, error;
6167 
6168 	idx = vmbus_chan_subidx(chan);
6169 
6170 	/*
6171 	 * Link this channel to RX/TX ring.
6172 	 */
6173 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6174 	    ("invalid channel index %d, should > 0 && < %d",
6175 	     idx, sc->hn_rx_ring_inuse));
6176 	rxr = &sc->hn_rx_ring[idx];
6177 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6178 	    ("RX ring %d already attached", idx));
6179 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6180 	rxr->hn_chan = chan;
6181 
6182 	if (bootverbose) {
6183 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6184 		    idx, vmbus_chan_id(chan));
6185 	}
6186 
6187 	if (idx < sc->hn_tx_ring_inuse) {
6188 		txr = &sc->hn_tx_ring[idx];
6189 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6190 		    ("TX ring %d already attached", idx));
6191 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6192 
6193 		txr->hn_chan = chan;
6194 		if (bootverbose) {
6195 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6196 			    idx, vmbus_chan_id(chan));
6197 		}
6198 	}
6199 
6200 	/* Bind this channel to a proper CPU. */
6201 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6202 
6203 	/*
6204 	 * Open this channel
6205 	 */
6206 	cbr.cbr = rxr->hn_br;
6207 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6208 	cbr.cbr_txsz = HN_TXBR_SIZE;
6209 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6210 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6211 	if (error) {
6212 		if (error == EISCONN) {
6213 			if_printf(sc->hn_ifp, "bufring is connected after "
6214 			    "chan%u open failure\n", vmbus_chan_id(chan));
6215 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6216 		} else {
6217 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6218 			    vmbus_chan_id(chan), error);
6219 		}
6220 	}
6221 	return (error);
6222 }
6223 
6224 static void
6225 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6226 {
6227 	struct hn_rx_ring *rxr;
6228 	int idx, error;
6229 
6230 	idx = vmbus_chan_subidx(chan);
6231 
6232 	/*
6233 	 * Unlink this channel from its RX/TX ring.
6234 	 */
6235 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6236 	    ("invalid channel index %d, should > 0 && < %d",
6237 	     idx, sc->hn_rx_ring_inuse));
6238 	rxr = &sc->hn_rx_ring[idx];
6239 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6240 	    ("RX ring %d is not attached", idx));
6241 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6242 
6243 	if (idx < sc->hn_tx_ring_inuse) {
6244 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6245 
6246 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6247 		    ("TX ring %d is not attached attached", idx));
6248 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6249 	}
6250 
6251 	/*
6252 	 * Close this channel.
6253 	 *
6254 	 * NOTE:
6255 	 * Channel closing does _not_ destroy the target channel.
6256 	 */
6257 	error = vmbus_chan_close_direct(chan);
6258 	if (error == EISCONN) {
6259 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6260 		    "after being closed\n", vmbus_chan_id(chan));
6261 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6262 	} else if (error) {
6263 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6264 		    vmbus_chan_id(chan), error);
6265 	}
6266 }
6267 
6268 static int
6269 hn_attach_subchans(struct hn_softc *sc)
6270 {
6271 	struct vmbus_channel **subchans;
6272 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6273 	int i, error = 0;
6274 
6275 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6276 
6277 	/* Attach the sub-channels. */
6278 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6279 	for (i = 0; i < subchan_cnt; ++i) {
6280 		int error1;
6281 
6282 		error1 = hn_chan_attach(sc, subchans[i]);
6283 		if (error1) {
6284 			error = error1;
6285 			/* Move on; all channels will be detached later. */
6286 		}
6287 	}
6288 	vmbus_subchan_rel(subchans, subchan_cnt);
6289 
6290 	if (error) {
6291 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6292 	} else {
6293 		if (bootverbose) {
6294 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6295 			    subchan_cnt);
6296 		}
6297 	}
6298 	return (error);
6299 }
6300 
6301 static void
6302 hn_detach_allchans(struct hn_softc *sc)
6303 {
6304 	struct vmbus_channel **subchans;
6305 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6306 	int i;
6307 
6308 	if (subchan_cnt == 0)
6309 		goto back;
6310 
6311 	/* Detach the sub-channels. */
6312 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6313 	for (i = 0; i < subchan_cnt; ++i)
6314 		hn_chan_detach(sc, subchans[i]);
6315 	vmbus_subchan_rel(subchans, subchan_cnt);
6316 
6317 back:
6318 	/*
6319 	 * Detach the primary channel, _after_ all sub-channels
6320 	 * are detached.
6321 	 */
6322 	hn_chan_detach(sc, sc->hn_prichan);
6323 
6324 	/* Wait for sub-channels to be destroyed, if any. */
6325 	vmbus_subchan_drain(sc->hn_prichan);
6326 
6327 #ifdef INVARIANTS
6328 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6329 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6330 		    HN_RX_FLAG_ATTACHED) == 0,
6331 		    ("%dth RX ring is still attached", i));
6332 	}
6333 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6334 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6335 		    HN_TX_FLAG_ATTACHED) == 0,
6336 		    ("%dth TX ring is still attached", i));
6337 	}
6338 #endif
6339 }
6340 
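/*
 * Allocate sub-channels from NVS according to the requested ring count
 * and the RSS capabilities reported by RNDIS.  On return, *nsubch holds
 * the number of sub-channels actually granted (0 means single channel).
 */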
6341 static int
6342 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6343 {
6344 	struct vmbus_channel **subchans;
6345 	int nchan, rxr_cnt, error;
6346 
6347 	nchan = *nsubch + 1;
6348 	if (nchan == 1) {
6349 		/*
6350 		 * Multiple RX/TX rings are not requested.
6351 		 */
6352 		*nsubch = 0;
6353 		return (0);
6354 	}
6355 
6356 	/*
6357 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6358 	 * table entries.
6359 	 */
6360 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6361 	if (error) {
6362 		/* No RSS; this is benign. */
6363 		*nsubch = 0;
6364 		return (0);
6365 	}
6366 	if (bootverbose) {
6367 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6368 		    rxr_cnt, nchan);
6369 	}
6370 
6371 	if (nchan > rxr_cnt)
6372 		nchan = rxr_cnt;
6373 	if (nchan == 1) {
6374 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6375 		*nsubch = 0;
6376 		return (0);
6377 	}
6378 
6379 	/*
6380 	 * Allocate sub-channels from NVS.
6381 	 */
6382 	*nsubch = nchan - 1;
6383 	error = hn_nvs_alloc_subchans(sc, nsubch);
6384 	if (error || *nsubch == 0) {
6385 		/* Failed to allocate sub-channels. */
6386 		*nsubch = 0;
6387 		return (0);
6388 	}
6389 
6390 	/*
6391 	 * Wait for all sub-channels to become ready before moving on.
6392 	 */
6393 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6394 	vmbus_subchan_rel(subchans, *nsubch);
6395 	return (0);
6396 }
6397 
6398 static bool
6399 hn_synth_attachable(const struct hn_softc *sc)
6400 {
6401 	int i;
6402 
6403 	if (sc->hn_flags & HN_FLAG_ERRORS)
6404 		return (false);
6405 
6406 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6407 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6408 
6409 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6410 			return (false);
6411 	}
6412 	return (true);
6413 }
6414 
6415 /*
6416  * Make sure that the RX filter is zero after the successful
6417  * RNDIS initialization.
6418  *
6419  * NOTE:
6420  * Under certain conditions on certain versions of Hyper-V,
6421  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6422  * after the successful RNDIS initialization, which breaks
6423  * the assumption of any following code (well, it breaks the
6424  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6425  * explicitly, drain packets sneaking through, and drain the
6426  * interrupt taskqueues scheduled due to the stealth packets.
6427  */
6428 static void
6429 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6430 {
6431 
6432 	hn_disable_rx(sc);
6433 	hn_drain_rxtx(sc, nchan);
6434 }
6435 
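/*
 * Attach the synthetic parts: the primary channel first, then NVS,
 * then RNDIS, then the optional sub-channels, and finally the RSS
 * key/indirect table when multiple channels are used.
 */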
6436 static int
6437 hn_synth_attach(struct hn_softc *sc, int mtu)
6438 {
6439 #define ATTACHED_NVS		0x0002
6440 #define ATTACHED_RNDIS		0x0004
6441 
6442 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6443 	int error, nsubch, nchan = 1, i, rndis_inited;
6444 	uint32_t old_caps, attached = 0;
6445 
6446 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6447 	    ("synthetic parts were attached"));
6448 
6449 	if (!hn_synth_attachable(sc))
6450 		return (ENXIO);
6451 
6452 	/* Save capabilities for later verification. */
6453 	old_caps = sc->hn_caps;
6454 	sc->hn_caps = 0;
6455 
6456 	/* Clear RSS stuffs. */
6457 	sc->hn_rss_ind_size = 0;
6458 	sc->hn_rss_hash = 0;
6459 	sc->hn_rss_hcap = 0;
6460 
6461 	/*
6462 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6463 	 */
6464 	error = hn_chan_attach(sc, sc->hn_prichan);
6465 	if (error)
6466 		goto failed;
6467 
6468 	/*
6469 	 * Attach NVS.
6470 	 */
6471 	error = hn_nvs_attach(sc, mtu);
6472 	if (error)
6473 		goto failed;
6474 	attached |= ATTACHED_NVS;
6475 
6476 	/*
6477 	 * Attach RNDIS _after_ NVS is attached.
6478 	 */
6479 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6480 	if (rndis_inited)
6481 		attached |= ATTACHED_RNDIS;
6482 	if (error)
6483 		goto failed;
6484 
6485 	/*
6486 	 * Make sure capabilities are not changed.
6487 	 */
6488 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6489 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6490 		    old_caps, sc->hn_caps);
6491 		error = ENXIO;
6492 		goto failed;
6493 	}
6494 
6495 	/*
6496 	 * Allocate sub-channels for multi-TX/RX rings.
6497 	 *
6498 	 * NOTE:
6499 	 * The # of RX rings that can be used is equivalent to the # of
6500 	 * channels to be requested.
6501 	 */
6502 	nsubch = sc->hn_rx_ring_cnt - 1;
6503 	error = hn_synth_alloc_subchans(sc, &nsubch);
6504 	if (error)
6505 		goto failed;
6506 	/* NOTE: _Full_ synthetic parts detach is required now. */
6507 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6508 
6509 	/*
6510 	 * Set the # of TX/RX rings that could be used according to
6511 	 * the # of channels that NVS offered.
6512 	 */
6513 	nchan = nsubch + 1;
6514 	hn_set_ring_inuse(sc, nchan);
6515 	if (nchan == 1) {
6516 		/* Only the primary channel can be used; done */
6517 		goto back;
6518 	}
6519 
6520 	/*
6521 	 * Attach the sub-channels.
6522 	 *
6523 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6524 	 */
6525 	error = hn_attach_subchans(sc);
6526 	if (error)
6527 		goto failed;
6528 
6529 	/*
6530 	 * Configure RSS key and indirect table _after_ all sub-channels
6531 	 * are attached.
6532 	 */
6533 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6534 		/*
6535 		 * RSS key is not set yet; set it to the default RSS key.
6536 		 */
6537 		if (bootverbose)
6538 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6539 #ifdef RSS
6540 		rss_getkey(rss->rss_key);
6541 #else
6542 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6543 #endif
6544 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6545 	}
6546 
6547 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6548 		/*
6549 		 * RSS indirect table is not set yet; set it up in round-
6550 		 * robin fashion.
6551 		 */
6552 		if (bootverbose) {
6553 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6554 			    "table\n");
6555 		}
6556 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6557 			uint32_t subidx;
6558 
6559 #ifdef RSS
6560 			subidx = rss_get_indirection_to_bucket(i);
6561 #else
6562 			subidx = i;
6563 #endif
6564 			rss->rss_ind[i] = subidx % nchan;
6565 		}
6566 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6567 	} else {
6568 		/*
6569 		 * # of usable channels may be changed, so we have to
6570 		 * make sure that all entries in RSS indirect table
6571 		 * are valid.
6572 		 *
6573 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6574 		 */
6575 		hn_rss_ind_fixup(sc);
6576 	}
6577 
6578 	sc->hn_rss_hash = sc->hn_rss_hcap;
6579 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6580 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6581 		/* NOTE: Don't reconfigure RSS here; it is done immediately below. */
6582 		hn_vf_rss_fixup(sc, false);
6583 	}
6584 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6585 	if (error)
6586 		goto failed;
6587 back:
6588 	/*
6589 	 * Fixup transmission aggregation setup.
6590 	 */
6591 	hn_set_txagg(sc);
6592 	hn_rndis_init_fixat(sc, nchan);
6593 	return (0);
6594 
6595 failed:
6596 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6597 		hn_rndis_init_fixat(sc, nchan);
6598 		hn_synth_detach(sc);
6599 	} else {
6600 		if (attached & ATTACHED_RNDIS) {
6601 			hn_rndis_init_fixat(sc, nchan);
6602 			hn_rndis_detach(sc);
6603 		}
6604 		if (attached & ATTACHED_NVS)
6605 			hn_nvs_detach(sc);
6606 		hn_chan_detach(sc, sc->hn_prichan);
6607 		/* Restore old capabilities. */
6608 		sc->hn_caps = old_caps;
6609 	}
6610 	return (error);
6611 
6612 #undef ATTACHED_RNDIS
6613 #undef ATTACHED_NVS
6614 }
6615 
6616 /*
6617  * NOTE:
6618  * The interface must have been suspended through hn_suspend() before
6619  * this function gets called.
6620  */
6621 static void
6622 hn_synth_detach(struct hn_softc *sc)
6623 {
6624 
6625 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6626 	    ("synthetic parts were not attached"));
6627 
6628 	/* Detach the RNDIS first. */
6629 	hn_rndis_detach(sc);
6630 
6631 	/* Detach NVS. */
6632 	hn_nvs_detach(sc);
6633 
6634 	/* Detach all of the channels. */
6635 	hn_detach_allchans(sc);
6636 
6637 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6638 		/*
6639 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6640 		 */
6641 		int error;
6642 
6643 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6644 		    sc->hn_rxbuf_gpadl);
6645 		if (error) {
6646 			if_printf(sc->hn_ifp,
6647 			    "rxbuf gpadl disconn failed: %d\n", error);
6648 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6649 		}
6650 		sc->hn_rxbuf_gpadl = 0;
6651 	}
6652 
6653 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6654 		/*
6655 		 * Host is post-Win2016, disconnect chimney sending buffer from
6656 		 * primary channel here.
6657 		 */
6658 		int error;
6659 
6660 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6661 		    sc->hn_chim_gpadl);
6662 		if (error) {
6663 			if_printf(sc->hn_ifp,
6664 			    "chim gpadl disconn failed: %d\n", error);
6665 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6666 		}
6667 		sc->hn_chim_gpadl = 0;
6668 	}
6669 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6670 }
6671 
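/*
 * Record how many RX/TX rings are actually used; the TX ring count is
 * capped by the number of TX rings that were created.
 */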
6672 static void
6673 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6674 {
6675 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6676 	    ("invalid ring count %d", ring_cnt));
6677 
6678 	if (sc->hn_tx_ring_cnt > ring_cnt)
6679 		sc->hn_tx_ring_inuse = ring_cnt;
6680 	else
6681 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6682 	sc->hn_rx_ring_inuse = ring_cnt;
6683 
6684 #ifdef RSS
6685 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6686 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6687 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6688 		    rss_getnumbuckets());
6689 	}
6690 #endif
6691 
6692 	if (bootverbose) {
6693 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6694 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6695 	}
6696 }
6697 
6698 static void
6699 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6700 {
6701 
6702 	/*
6703 	 * NOTE:
6704 	 * The TX bufring will not be drained by the hypervisor
6705 	 * if the primary channel is revoked.
6706 	 */
6707 	while (!vmbus_chan_rx_empty(chan) ||
6708 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6709 	     !vmbus_chan_tx_empty(chan)))
6710 		pause("waitch", 1);
6711 	vmbus_chan_intr_drain(chan);
6712 }
6713 
6714 static void
6715 hn_disable_rx(struct hn_softc *sc)
6716 {
6717 
6718 	/*
6719 	 * Disable RX by clearing RX filter forcefully.
6720 	 */
6721 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6722 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6723 
6724 	/*
6725 	 * Give RNDIS enough time to flush all pending data packets.
6726 	 */
6727 	pause("waitrx", (200 * hz) / 1000);
6728 }
6729 
6730 /*
6731  * NOTE:
6732  * RX/TX _must_ have been suspended/disabled, before this function
6733  * is called.
6734  */
6735 static void
6736 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6737 {
6738 	struct vmbus_channel **subch = NULL;
6739 	int nsubch;
6740 
6741 	/*
6742 	 * Drain RX/TX bufrings and interrupts.
6743 	 */
6744 	nsubch = nchan - 1;
6745 	if (nsubch > 0)
6746 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6747 
6748 	if (subch != NULL) {
6749 		int i;
6750 
6751 		for (i = 0; i < nsubch; ++i)
6752 			hn_chan_drain(sc, subch[i]);
6753 	}
6754 	hn_chan_drain(sc, sc->hn_prichan);
6755 
6756 	if (subch != NULL)
6757 		vmbus_subchan_rel(subch, nsubch);
6758 }
6759 
6760 static void
6761 hn_suspend_data(struct hn_softc *sc)
6762 {
6763 	struct hn_tx_ring *txr;
6764 	int i;
6765 
6766 	HN_LOCK_ASSERT(sc);
6767 
6768 	/*
6769 	 * Suspend TX.
6770 	 */
6771 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6772 		txr = &sc->hn_tx_ring[i];
6773 
6774 		mtx_lock(&txr->hn_tx_lock);
6775 		txr->hn_suspended = 1;
6776 		mtx_unlock(&txr->hn_tx_lock);
6777 		/* No one is able to send more packets now. */
6778 
6779 		/*
6780 		 * Wait for all pending sends to finish.
6781 		 *
6782 		 * NOTE:
6783 		 * We will _not_ receive all pending send-dones if the
6784 		 * primary channel is revoked.
6785 		 */
6786 		while (hn_tx_ring_pending(txr) &&
6787 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6788 			pause("hnwtx", 1 /* 1 tick */);
6789 	}
6790 
6791 	/*
6792 	 * Disable RX.
6793 	 */
6794 	hn_disable_rx(sc);
6795 
6796 	/*
6797 	 * Drain RX/TX.
6798 	 */
6799 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6800 
6801 	/*
6802 	 * Drain any pending TX tasks.
6803 	 *
6804 	 * NOTE:
6805 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6806 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6807 	 */
6808 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6809 		txr = &sc->hn_tx_ring[i];
6810 
6811 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6812 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6813 	}
6814 }
6815 
6816 static void
6817 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6818 {
6819 
6820 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6821 }
6822 
6823 static void
6824 hn_suspend_mgmt(struct hn_softc *sc)
6825 {
6826 	struct task task;
6827 
6828 	HN_LOCK_ASSERT(sc);
6829 
6830 	/*
6831 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6832 	 * through hn_mgmt_taskq.
6833 	 */
6834 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6835 	vmbus_chan_run_task(sc->hn_prichan, &task);
6836 
6837 	/*
6838 	 * Make sure that all pending management tasks are completed.
6839 	 */
6840 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6841 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6842 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6843 }
6844 
6845 static void
6846 hn_suspend(struct hn_softc *sc)
6847 {
6848 
6849 	/* Disable polling. */
6850 	hn_polling(sc, 0);
6851 
6852 	/*
6853 	 * If the non-transparent mode VF is activated, the synthetic
6854 	 * device is receiving packets, so the data path of the
6855 	 * synthetic device must be suspended.
6856 	 */
6857 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6858 	    (sc->hn_flags & HN_FLAG_RXVF))
6859 		hn_suspend_data(sc);
6860 	hn_suspend_mgmt(sc);
6861 }
6862 
6863 static void
6864 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6865 {
6866 	int i;
6867 
6868 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6869 	    ("invalid TX ring count %d", tx_ring_cnt));
6870 
6871 	for (i = 0; i < tx_ring_cnt; ++i) {
6872 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6873 
6874 		mtx_lock(&txr->hn_tx_lock);
6875 		txr->hn_suspended = 0;
6876 		mtx_unlock(&txr->hn_tx_lock);
6877 	}
6878 }
6879 
6880 static void
6881 hn_resume_data(struct hn_softc *sc)
6882 {
6883 	int i;
6884 
6885 	HN_LOCK_ASSERT(sc);
6886 
6887 	/*
6888 	 * Re-enable RX.
6889 	 */
6890 	hn_rxfilter_config(sc);
6891 
6892 	/*
6893 	 * Make sure to clear suspend status on "all" TX rings,
6894 	 * since hn_tx_ring_inuse can be changed after
6895 	 * hn_suspend_data().
6896 	 */
6897 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6898 
6899 #ifdef HN_IFSTART_SUPPORT
6900 	if (!hn_use_if_start)
6901 #endif
6902 	{
6903 		/*
6904 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6905 		 * reduced.
6906 		 */
6907 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6908 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6909 	}
6910 
6911 	/*
6912 	 * Kick start TX.
6913 	 */
6914 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6915 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6916 
6917 		/*
6918 		 * Use txeof task, so that any pending oactive can be
6919 		 * cleared properly.
6920 		 */
6921 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6922 	}
6923 }
6924 
6925 static void
6926 hn_resume_mgmt(struct hn_softc *sc)
6927 {
6928 
6929 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6930 
6931 	/*
6932 	 * Kick off network change detection, if it was pending.
6933 	 * If no network change was pending, start link status
6934 	 * checks, which is more lightweight than network change
6935 	 * detection.
6936 	 */
6937 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6938 		hn_change_network(sc);
6939 	else
6940 		hn_update_link_status(sc);
6941 }
6942 
6943 static void
6944 hn_resume(struct hn_softc *sc)
6945 {
6946 
6947 	/*
6948 	 * If the non-transparent mode VF is activated, the synthetic
6949 	 * device has to receive packets, so the data path of the
6950 	 * synthetic device must be resumed.
6951 	 */
6952 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6953 	    (sc->hn_flags & HN_FLAG_RXVF))
6954 		hn_resume_data(sc);
6955 
6956 	/*
6957 	 * Don't resume link status change if VF is attached/activated.
6958 	 * - In the non-transparent VF mode, the synthetic device marks
6959 	 *   link down until the VF is deactivated; i.e. VF is down.
6960 	 * - In transparent VF mode, VF's media status is used until
6961 	 *   the VF is detached.
6962 	 */
6963 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6964 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6965 		hn_resume_mgmt(sc);
6966 
6967 	/*
6968 	 * Re-enable polling if this interface is running and
6969 	 * the polling is requested.
6970 	 */
6971 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6972 		hn_polling(sc, sc->hn_pollhz);
6973 }
6974 
6975 static void
6976 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6977 {
6978 	const struct rndis_status_msg *msg;
6979 	int ofs;
6980 
6981 	if (dlen < sizeof(*msg)) {
6982 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6983 		return;
6984 	}
6985 	msg = data;
6986 
6987 	switch (msg->rm_status) {
6988 	case RNDIS_STATUS_MEDIA_CONNECT:
6989 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6990 		hn_update_link_status(sc);
6991 		break;
6992 
6993 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6994 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6995 		/* Not really useful; ignore. */
6996 		break;
6997 
6998 	case RNDIS_STATUS_NETWORK_CHANGE:
6999 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7000 		if (dlen < ofs + msg->rm_stbuflen ||
7001 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7002 			if_printf(sc->hn_ifp, "network changed\n");
7003 		} else {
7004 			uint32_t change;
7005 
7006 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7007 			    sizeof(change));
7008 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7009 			    change);
7010 		}
7011 		hn_change_network(sc);
7012 		break;
7013 
7014 	default:
7015 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7016 		    msg->rm_status);
7017 		break;
7018 	}
7019 }
7020 
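/*
 * Walk the RNDIS per-packet-info records attached to a received packet
 * and extract the VLAN, checksum and hash information, validating each
 * record's size and offset along the way.
 */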
7021 static int
7022 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7023 {
7024 	const struct rndis_pktinfo *pi = info_data;
7025 	uint32_t mask = 0;
7026 
7027 	while (info_dlen != 0) {
7028 		const void *data;
7029 		uint32_t dlen;
7030 
7031 		if (__predict_false(info_dlen < sizeof(*pi)))
7032 			return (EINVAL);
7033 		if (__predict_false(info_dlen < pi->rm_size))
7034 			return (EINVAL);
7035 		info_dlen -= pi->rm_size;
7036 
7037 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7038 			return (EINVAL);
7039 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7040 			return (EINVAL);
7041 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7042 		data = pi->rm_data;
7043 
7044 		switch (pi->rm_type) {
7045 		case NDIS_PKTINFO_TYPE_VLAN:
7046 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7047 				return (EINVAL);
7048 			info->vlan_info = *((const uint32_t *)data);
7049 			mask |= HN_RXINFO_VLAN;
7050 			break;
7051 
7052 		case NDIS_PKTINFO_TYPE_CSUM:
7053 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7054 				return (EINVAL);
7055 			info->csum_info = *((const uint32_t *)data);
7056 			mask |= HN_RXINFO_CSUM;
7057 			break;
7058 
7059 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7060 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7061 				return (EINVAL);
7062 			info->hash_value = *((const uint32_t *)data);
7063 			mask |= HN_RXINFO_HASHVAL;
7064 			break;
7065 
7066 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7067 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7068 				return (EINVAL);
7069 			info->hash_info = *((const uint32_t *)data);
7070 			mask |= HN_RXINFO_HASHINF;
7071 			break;
7072 
7073 		default:
7074 			goto next;
7075 		}
7076 
7077 		if (mask == HN_RXINFO_ALL) {
7078 			/* All found; done */
7079 			break;
7080 		}
7081 next:
7082 		pi = (const struct rndis_pktinfo *)
7083 		    ((const uint8_t *)pi + pi->rm_size);
7084 	}
7085 
7086 	/*
7087 	 * Final fixup.
7088 	 * - If there is no hash value, invalidate the hash info.
7089 	 */
7090 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7091 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7092 	return (0);
7093 }
7094 
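/*
 * Return true if the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 */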
7095 static __inline bool
7096 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7097 {
7098 
7099 	if (off < check_off) {
7100 		if (__predict_true(off + len <= check_off))
7101 			return (false);
7102 	} else if (off > check_off) {
7103 		if (__predict_true(check_off + check_len <= off))
7104 			return (false);
7105 	}
7106 	return (true);
7107 }
7108 
7109 static void
7110 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7111 {
7112 	const struct rndis_packet_msg *pkt;
7113 	struct hn_rxinfo info;
7114 	int data_off, pktinfo_off, data_len, pktinfo_len;
7115 
7116 	/*
7117 	 * Check length.
7118 	 */
7119 	if (__predict_false(dlen < sizeof(*pkt))) {
7120 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7121 		return;
7122 	}
7123 	pkt = data;
7124 
7125 	if (__predict_false(dlen < pkt->rm_len)) {
7126 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7127 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7128 		return;
7129 	}
7130 	if (__predict_false(pkt->rm_len <
7131 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7132 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7133 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7134 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7135 		    pkt->rm_pktinfolen);
7136 		return;
7137 	}
7138 	if (__predict_false(pkt->rm_datalen == 0)) {
7139 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7140 		return;
7141 	}
7142 
7143 	/*
7144 	 * Check offsets.
7145 	 */
7146 #define IS_OFFSET_INVALID(ofs)			\
7147 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7148 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7149 
7150 	/* XXX Hyper-V does not meet data offset alignment requirement */
7151 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7152 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7153 		    "data offset %u\n", pkt->rm_dataoffset);
7154 		return;
7155 	}
7156 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7157 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7158 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7159 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7160 		return;
7161 	}
7162 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7163 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7164 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7165 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7166 		return;
7167 	}
7168 
7169 #undef IS_OFFSET_INVALID
7170 
7171 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7172 	data_len = pkt->rm_datalen;
7173 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7174 	pktinfo_len = pkt->rm_pktinfolen;
7175 
7176 	/*
7177 	 * Check OOB coverage.
7178 	 */
7179 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7180 		int oob_off, oob_len;
7181 
7182 		if_printf(rxr->hn_ifp, "got oobdata\n");
7183 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7184 		oob_len = pkt->rm_oobdatalen;
7185 
7186 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7187 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7188 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7189 			    pkt->rm_len, oob_off, oob_len);
7190 			return;
7191 		}
7192 
7193 		/*
7194 		 * Check against data.
7195 		 */
7196 		if (hn_rndis_check_overlap(oob_off, oob_len,
7197 		    data_off, data_len)) {
7198 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7199 			    "oob overlaps data, oob abs %d len %d, "
7200 			    "data abs %d len %d\n",
7201 			    oob_off, oob_len, data_off, data_len);
7202 			return;
7203 		}
7204 
7205 		/*
7206 		 * Check against pktinfo.
7207 		 */
7208 		if (pktinfo_len != 0 &&
7209 		    hn_rndis_check_overlap(oob_off, oob_len,
7210 		    pktinfo_off, pktinfo_len)) {
7211 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7212 			    "oob overlaps pktinfo, oob abs %d len %d, "
7213 			    "pktinfo abs %d len %d\n",
7214 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7215 			return;
7216 		}
7217 	}
7218 
7219 	/*
7220 	 * Check per-packet-info coverage and find useful per-packet-info.
7221 	 */
7222 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7223 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7224 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7225 	if (__predict_true(pktinfo_len != 0)) {
7226 		bool overlap;
7227 		int error;
7228 
7229 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7230 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7231 			    "pktinfo overflow, msglen %u, "
7232 			    "pktinfo abs %d len %d\n",
7233 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7234 			return;
7235 		}
7236 
7237 		/*
7238 		 * Check packet info coverage.
7239 		 */
7240 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7241 		    data_off, data_len);
7242 		if (__predict_false(overlap)) {
7243 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7244 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7245 			    "data abs %d len %d\n",
7246 			    pktinfo_off, pktinfo_len, data_off, data_len);
7247 			return;
7248 		}
7249 
7250 		/*
7251 		 * Find useful per-packet-info.
7252 		 */
7253 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7254 		    pktinfo_len, &info);
7255 		if (__predict_false(error)) {
7256 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7257 			    "pktinfo\n");
7258 			return;
7259 		}
7260 	}
7261 
7262 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7263 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7264 		    "data overflow, msglen %u, data abs %d len %d\n",
7265 		    pkt->rm_len, data_off, data_len);
7266 		return;
7267 	}
7268 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7269 }
7270 
7271 static __inline void
7272 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7273 {
7274 	const struct rndis_msghdr *hdr;
7275 
7276 	if (__predict_false(dlen < sizeof(*hdr))) {
7277 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7278 		return;
7279 	}
7280 	hdr = data;
7281 
7282 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7283 		/* Hot data path. */
7284 		hn_rndis_rx_data(rxr, data, dlen);
7285 		/* Done! */
7286 		return;
7287 	}
7288 
7289 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7290 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7291 	else
7292 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7293 }
7294 
7295 static void
7296 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7297 {
7298 	const struct hn_nvs_hdr *hdr;
7299 
7300 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7301 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7302 		return;
7303 	}
7304 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7305 
7306 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7307 		/* Useless; ignore */
7308 		return;
7309 	}
7310 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7311 }
7312 
7313 static void
7314 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7315     const struct vmbus_chanpkt_hdr *pkt)
7316 {
7317 	struct hn_nvs_sendctx *sndc;
7318 
7319 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7320 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7321 	    VMBUS_CHANPKT_DATALEN(pkt));
7322 	/*
7323 	 * NOTE:
7324 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7325 	 * its callback.
7326 	 */
7327 }
7328 
7329 static void
7330 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7331     const struct vmbus_chanpkt_hdr *pkthdr)
7332 {
7333 	const struct vmbus_chanpkt_rxbuf *pkt;
7334 	const struct hn_nvs_hdr *nvs_hdr;
7335 	int count, i, hlen;
7336 
7337 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7338 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7339 		return;
7340 	}
7341 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7342 
7343 	/* Make sure that this is a RNDIS message. */
7344 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7345 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7346 		    nvs_hdr->nvs_type);
7347 		return;
7348 	}
7349 
7350 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7351 	if (__predict_false(hlen < sizeof(*pkt))) {
7352 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7353 		return;
7354 	}
7355 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7356 
7357 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7358 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7359 		    pkt->cp_rxbuf_id);
7360 		return;
7361 	}
7362 
7363 	count = pkt->cp_rxbuf_cnt;
7364 	if (__predict_false(hlen <
7365 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7366 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7367 		return;
7368 	}
7369 
7370 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7371 	for (i = 0; i < count; ++i) {
7372 		int ofs, len;
7373 
7374 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7375 		len = pkt->cp_rxbuf[i].rb_len;
7376 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7377 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7378 			    "ofs %d, len %d\n", i, ofs, len);
7379 			continue;
7380 		}
7381 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7382 	}
7383 
7384 	/*
7385 	 * Ack the consumed RXBUF associated w/ this channel packet,
7386 	 * so that this RXBUF can be recycled by the hypervisor.
7387 	 */
7388 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7389 }
7390 
7391 static void
7392 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7393     uint64_t tid)
7394 {
7395 	struct hn_nvs_rndis_ack ack;
7396 	int retries, error;
7397 
7398 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7399 	ack.nvs_status = HN_NVS_STATUS_OK;
7400 
7401 	retries = 0;
7402 again:
7403 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7404 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7405 	if (__predict_false(error == EAGAIN)) {
7406 		/*
7407 		 * NOTE:
7408 		 * This should _not_ happen in real world, since the
7409 		 * consumption of the TX bufring from the TX path is
7410 		 * controlled.
7411 		 */
7412 		if (rxr->hn_ack_failed == 0)
7413 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7414 		rxr->hn_ack_failed++;
7415 		retries++;
7416 		if (retries < 10) {
7417 			DELAY(100);
7418 			goto again;
7419 		}
7420 		/* RXBUF leaks! */
7421 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7422 	}
7423 }
7424 
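/*
 * Per-channel receive callback.  Drain the channel, growing the
 * per-ring packet buffer on ENOBUFS, and dispatch each packet by its
 * type; EAGAIN from vmbus_chan_recv_pkt() means the channel is empty
 * and terminates the loop.  hn_chan_rollup() runs once the channel
 * has been drained.
 */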
7425 static void
7426 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7427 {
7428 	struct hn_rx_ring *rxr = xrxr;
7429 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7430 
7431 	for (;;) {
7432 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7433 		int error, pktlen;
7434 
7435 		pktlen = rxr->hn_pktbuf_len;
7436 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7437 		if (__predict_false(error == ENOBUFS)) {
7438 			void *nbuf;
7439 			int nlen;
7440 
7441 			/*
7442 			 * Expand channel packet buffer.
7443 			 *
7444 			 * XXX
7445 			 * Use M_WAITOK here, since allocation failure
7446 			 * is fatal.
7447 			 */
7448 			nlen = rxr->hn_pktbuf_len * 2;
7449 			while (nlen < pktlen)
7450 				nlen *= 2;
7451 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7452 
7453 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7454 			    rxr->hn_pktbuf_len, nlen);
7455 
7456 			free(rxr->hn_pktbuf, M_DEVBUF);
7457 			rxr->hn_pktbuf = nbuf;
7458 			rxr->hn_pktbuf_len = nlen;
7459 			/* Retry! */
7460 			continue;
7461 		} else if (__predict_false(error == EAGAIN)) {
7462 			/* No more channel packets; done! */
7463 			break;
7464 		}
7465 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7466 
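		/*
		 * Dispatch on the channel packet type: completions,
		 * RXBUF (RNDIS data) and in-band notifications.
		 */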
7467 		switch (pkt->cph_type) {
7468 		case VMBUS_CHANPKT_TYPE_COMP:
7469 			hn_nvs_handle_comp(sc, chan, pkt);
7470 			break;
7471 
7472 		case VMBUS_CHANPKT_TYPE_RXBUF:
7473 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7474 			break;
7475 
7476 		case VMBUS_CHANPKT_TYPE_INBAND:
7477 			hn_nvs_handle_notify(sc, pkt);
7478 			break;
7479 
7480 		default:
7481 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7482 			    pkt->cph_type);
7483 			break;
7484 		}
7485 	}
7486 	hn_chan_rollup(rxr, rxr->hn_txr);
7487 }
7488 
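/*
 * Module-wide initialization: allocate the hn_udpcs_fixup counter,
 * sanitize the transparent VF tunables, set up the VF map, clamp the
 * number of TX taskqueues to [1, mp_ncpus] and validate the TX
 * taskqueue mode.  The global TX taskqueues are created only when
 * running on Hyper-V with the global taskqueue mode selected.
 */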
7489 static void
7490 hn_sysinit(void *arg __unused)
7491 {
7492 	int i;
7493 
7494 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7495 
7496 #ifdef HN_IFSTART_SUPPORT
7497 	/*
7498 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7499 	 * mainly due to the IFF_DRV_OACTIVE flag.
7500 	 */
7501 	if (hn_xpnt_vf && hn_use_if_start) {
7502 		hn_use_if_start = 0;
7503 		printf("hn: transparent VF mode, if_transmit will be used "
7504 		    "instead of if_start\n");
7505 	}
7506 #endif
7507 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7508 		printf("hn: invalid transparent VF attach routing "
7509 		    "wait timeout %d, reset to %d\n",
7510 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7511 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7512 	}
7513 
7514 	/*
7515 	 * Initialize VF map.
7516 	 */
7517 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7518 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7519 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7520 	    M_WAITOK | M_ZERO);
7521 
7522 	/*
7523 	 * Fix the # of TX taskqueues.
7524 	 */
7525 	if (hn_tx_taskq_cnt <= 0)
7526 		hn_tx_taskq_cnt = 1;
7527 	else if (hn_tx_taskq_cnt > mp_ncpus)
7528 		hn_tx_taskq_cnt = mp_ncpus;
7529 
7530 	/*
7531 	 * Fix the TX taskqueue mode.
7532 	 */
7533 	switch (hn_tx_taskq_mode) {
7534 	case HN_TX_TASKQ_M_INDEP:
7535 	case HN_TX_TASKQ_M_GLOBAL:
7536 	case HN_TX_TASKQ_M_EVTTQ:
7537 		break;
7538 	default:
7539 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7540 		break;
7541 	}
7542 
7543 	if (vm_guest != VM_GUEST_HV)
7544 		return;
7545 
7546 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7547 		return;
7548 
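	/*
	 * Create the global TX taskqueues shared by all hn(4) instances,
	 * one single-threaded taskqueue per entry.
	 */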
7549 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7550 	    M_DEVBUF, M_WAITOK);
7551 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7552 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7553 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7554 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7555 		    "hn tx%d", i);
7556 	}
7557 }
7558 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7559 
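/*
 * Module-wide teardown: free the global TX taskqueues, the VF map and
 * its lock, and the hn_udpcs_fixup counter allocated by hn_sysinit().
 */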
7560 static void
7561 hn_sysuninit(void *arg __unused)
7562 {
7563 
7564 	if (hn_tx_taskque != NULL) {
7565 		int i;
7566 
7567 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7568 			taskqueue_free(hn_tx_taskque[i]);
7569 		free(hn_tx_taskque, M_DEVBUF);
7570 	}
7571 
7572 	if (hn_vfmap != NULL)
7573 		free(hn_vfmap, M_DEVBUF);
7574 	rm_destroy(&hn_vfmap_lock);
7575 
7576 	counter_u64_free(hn_udpcs_fixup);
7577 }
7578 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7579