xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 532828a0)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
29 
30 ***************************************************************************/
31 
32 #include "opt_ifpoll.h"
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/linker.h>
38 #include <sys/firmware.h>
39 #include <sys/endian.h>
40 #include <sys/in_cksum.h>
41 #include <sys/sockio.h>
42 #include <sys/mbuf.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/module.h>
46 #include <sys/serialize.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 #include <net/if_poll.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/vlan/if_vlan_var.h>
62 #include <net/zlib.h>
63 #include <net/toeplitz.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
69 
70 #include <sys/bus.h>
71 #include <sys/rman.h>
72 
73 #include <bus/pci/pcireg.h>
74 #include <bus/pci/pcivar.h>
75 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
76 
77 #include <vm/vm.h>		/* for pmap_mapdev() */
78 #include <vm/pmap.h>
79 
80 #if defined(__i386__) || defined(__x86_64__)
81 #include <machine/specialreg.h>
82 #endif
83 
84 #include <dev/netif/mxge/mxge_mcp.h>
85 #include <dev/netif/mxge/mcp_gen_header.h>
86 #include <dev/netif/mxge/if_mxge_var.h>
87 
88 #define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
89 #define MXGE_HWRSS_KEYLEN		16
90 
91 /* Tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_ticks;
98 static int mxge_num_slices = 0;
99 static int mxge_always_promisc = 0;
100 static int mxge_throttle = 0;
101 static int mxge_msi_enable = 1;
102 static int mxge_msix_enable = 1;
103 static int mxge_multi_tx = 1;
104 /*
105  * Don't use RSS by default, it's just too slow
106  */
107 static int mxge_use_rss = 0;
108 
109 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
110 static const char *mxge_fw_aligned = "mxge_eth_z8e";
111 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
112 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
113 
114 TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
115 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
116 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
117 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
118 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
119 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
120 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
121 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
122 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
123 TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
124 TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
125 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
126 TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
127 
128 static int mxge_probe(device_t dev);
129 static int mxge_attach(device_t dev);
130 static int mxge_detach(device_t dev);
131 static int mxge_shutdown(device_t dev);
132 
133 static int mxge_alloc_intr(struct mxge_softc *sc);
134 static void mxge_free_intr(struct mxge_softc *sc);
135 static int mxge_setup_intr(struct mxge_softc *sc);
136 static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);
137 
138 static device_method_t mxge_methods[] = {
139 	/* Device interface */
140 	DEVMETHOD(device_probe, mxge_probe),
141 	DEVMETHOD(device_attach, mxge_attach),
142 	DEVMETHOD(device_detach, mxge_detach),
143 	DEVMETHOD(device_shutdown, mxge_shutdown),
144 	DEVMETHOD_END
145 };
146 
147 static driver_t mxge_driver = {
148 	"mxge",
149 	mxge_methods,
150 	sizeof(mxge_softc_t),
151 };
152 
153 static devclass_t mxge_devclass;
154 
155 /* Declare ourselves to be a child of the PCI bus. */
156 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
157 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
158 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
159 
160 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
161 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
162 static void mxge_close(mxge_softc_t *sc, int down);
163 static int mxge_open(mxge_softc_t *sc);
164 static void mxge_tick(void *arg);
165 static void mxge_watchdog_reset(mxge_softc_t *sc);
166 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
167 
168 static int
169 mxge_probe(device_t dev)
170 {
171 	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
172 	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
173 	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
174 		int rev = pci_get_revid(dev);
175 
176 		switch (rev) {
177 		case MXGE_PCI_REV_Z8E:
178 			device_set_desc(dev, "Myri10G-PCIE-8A");
179 			break;
180 		case MXGE_PCI_REV_Z8ES:
181 			device_set_desc(dev, "Myri10G-PCIE-8B");
182 			break;
183 		default:
184 			device_set_desc(dev, "Myri10G-PCIE-8??");
185 			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
186 			break;
187 		}
188 		return 0;
189 	}
190 	return ENXIO;
191 }
192 
193 static void
194 mxge_enable_wc(mxge_softc_t *sc)
195 {
196 #if defined(__i386__) || defined(__x86_64__)
197 	vm_offset_t len;
198 
199 	sc->wc = 1;
200 	len = rman_get_size(sc->mem_res);
201 	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
202 	    PAT_WRITE_COMBINING);
203 #endif
204 }
205 
206 static int
207 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
208     bus_size_t alignment)
209 {
210 	bus_size_t boundary;
211 	int err;
212 
213 	if (bytes > 4096 && alignment == 4096)
214 		boundary = 0;
215 	else
216 		boundary = 4096;
217 
218 	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
219 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
220 	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
221 	if (err != 0) {
222 		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
223 		return err;
224 	}
225 	return 0;
226 }
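#if 0
/*
 * Usage sketch (compiled out): allocate a 4KB-aligned coherent DMA
 * buffer and release it again.  "example_dma" is purely illustrative;
 * real callers hold the result in softc fields such as sc->cmd_dma.
 */
static int
mxge_dma_alloc_example(mxge_softc_t *sc)
{
	bus_dmamem_t example_dma;
	int err;

	err = mxge_dma_alloc(sc, &example_dma, 4096, 4096);
	if (err != 0)
		return err;
	/* ... use example_dma.dmem_addr (KVA) / dmem_busaddr (bus) ... */
	mxge_dma_free(&example_dma);
	return 0;
}
#endif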
227 
228 static void
229 mxge_dma_free(bus_dmamem_t *dma)
230 {
231 	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
232 	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
233 	bus_dma_tag_destroy(dma->dmem_tag);
234 }
235 
236 /*
237  * The eeprom strings on the LanaiX have the format
238  * SN=x\0
239  * MAC=x:x:x:x:x:x\0
240  * PC=text\0
241  */
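/*
 * For example (illustrative values only), a string block might look
 * like:
 *
 *	"SN=123456\0MAC=00:60:dd:12:34:56\0PC=10G-PCIE-8A-C\0\0"
 *
 * i.e. NUL-terminated key=value strings, terminated by an empty
 * string.
 */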
242 static int
243 mxge_parse_strings(mxge_softc_t *sc)
244 {
245 	const char *ptr;
246 	int i, found_mac, found_sn2;
247 	char *endptr;
248 
249 	ptr = sc->eeprom_strings;
250 	found_mac = 0;
251 	found_sn2 = 0;
252 	while (*ptr != '\0') {
253 		if (strncmp(ptr, "MAC=", 4) == 0) {
254 			ptr += 4;
255 			for (i = 0;;) {
256 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
257 				if (endptr - ptr != 2)
258 					goto abort;
259 				ptr = endptr;
260 				if (++i == 6)
261 					break;
262 				if (*ptr++ != ':')
263 					goto abort;
264 			}
265 			found_mac = 1;
266 		} else if (strncmp(ptr, "PC=", 3) == 0) {
267 			ptr += 3;
268 			strlcpy(sc->product_code_string, ptr,
269 			    sizeof(sc->product_code_string));
270 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
271 			ptr += 3;
272 			strlcpy(sc->serial_number_string, ptr,
273 			    sizeof(sc->serial_number_string));
274 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
275 			/* SN2 takes precedence over SN */
276 			ptr += 4;
277 			found_sn2 = 1;
278 			strlcpy(sc->serial_number_string, ptr,
279 			    sizeof(sc->serial_number_string));
280 		}
281 		while (*ptr++ != '\0') {}
282 	}
283 
284 	if (found_mac)
285 		return 0;
286 
287 abort:
288 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
289 	return ENXIO;
290 }
291 
292 #if defined(__i386__) || defined(__x86_64__)
293 
294 static void
295 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
296 {
297 	uint32_t val;
298 	unsigned long base, off;
299 	char *va, *cfgptr;
300 	device_t pdev, mcp55;
301 	uint16_t vendor_id, device_id, word;
302 	uintptr_t bus, slot, func, ivend, idev;
303 	uint32_t *ptr32;
304 
305 	if (!mxge_nvidia_ecrc_enable)
306 		return;
307 
308 	pdev = device_get_parent(device_get_parent(sc->dev));
309 	if (pdev == NULL) {
310 		device_printf(sc->dev, "could not find parent?\n");
311 		return;
312 	}
313 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
314 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
315 
316 	if (vendor_id != 0x10de)
317 		return;
318 
319 	base = 0;
320 
321 	if (device_id == 0x005d) {
322 		/* ck804, base address is magic */
323 		base = 0xe0000000UL;
324 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
325 		/* mcp55, base address stored in chipset */
326 		mcp55 = pci_find_bsf(0, 0, 0);
327 		if (mcp55 &&
328 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
329 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
330 			word = pci_read_config(mcp55, 0x90, 2);
331 			base = ((unsigned long)word & 0x7ffeU) << 25;
332 		}
333 	}
334 	if (!base)
335 		return;
336 
337 	/*
338 	 * XXXX
339 	 * Test below is commented because it is believed that doing
340 	 * config read/write beyond 0xff will access the config space
341 	 * for the next (higher-numbered) function.  Uncomment this and remove
342 	 * the hacky pmap_mapdev() way of accessing config space when
343 	 * DragonFly grows support for extended pcie config space access.
344 	 */
345 #if 0
346 	/*
347 	 * See if we can, by some miracle, access the extended
348 	 * config space
349 	 */
350 	val = pci_read_config(pdev, 0x178, 4);
351 	if (val != 0xffffffff) {
352 		val |= 0x40;
353 		pci_write_config(pdev, 0x178, val, 4);
354 		return;
355 	}
356 #endif
357 	/*
358 	 * Rather than using normal pci config space writes, we must
359 	 * map the Nvidia config space ourselves.  This is because on
360 	 * opteron/nvidia class machines the 0xe0000000 mapping is
361 	 * handled by the nvidia chipset; that means the internal PCI
362 	 * device (the on-chip northbridge), or the amd-8131 bridge
363 	 * and things behind them, are not visible via this method.
364 	 */
365 
366 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
367 		      PCI_IVAR_BUS, &bus);
368 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
369 		      PCI_IVAR_SLOT, &slot);
370 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
371 		      PCI_IVAR_FUNCTION, &func);
372 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
373 		      PCI_IVAR_VENDOR, &ivend);
374 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
375 		      PCI_IVAR_DEVICE, &idev);
376 
377 	off =  base + 0x00100000UL * (unsigned long)bus +
378 	    0x00001000UL * (unsigned long)(func + 8 * slot);
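	/*
	 * Worked example (illustrative numbers): with the mcp55 base at
	 * 0xe0000000, bus 5, slot 0, function 0 gives
	 * off = 0xe0000000 + 5 * 0x00100000 = 0xe0500000, i.e. 1MB of
	 * extended config space per bus and 4KB per function, with 8
	 * functions per slot.
	 */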
379 
380 	/* map it into the kernel */
381 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
382 	if (va == NULL) {
383 		device_printf(sc->dev, "pmap_mapdev failed\n");
384 		return;
385 	}
386 	/* get a pointer to the config space mapped into the kernel */
387 	cfgptr = va + (off & PAGE_MASK);
388 
389 	/* make sure that we can really access it */
390 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
391 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
392 	if (!(vendor_id == ivend && device_id == idev)) {
393 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
394 		    vendor_id, device_id);
395 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
396 		return;
397 	}
398 
399 	ptr32 = (uint32_t*)(cfgptr + 0x178);
400 	val = *ptr32;
401 
402 	if (val == 0xffffffff) {
403 		device_printf(sc->dev, "extended mapping failed\n");
404 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
405 		return;
406 	}
407 	*ptr32 = val | 0x40;
408 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
409 	if (bootverbose) {
410 		device_printf(sc->dev, "Enabled ECRC on upstream "
411 		    "Nvidia bridge at %d:%d:%d\n",
412 		    (int)bus, (int)slot, (int)func);
413 	}
414 }
415 
416 #else	/* __i386__ || __x86_64__ */
417 
418 static void
419 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
420 {
421 	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
422 }
423 
424 #endif
425 
426 static int
427 mxge_dma_test(mxge_softc_t *sc, int test_type)
428 {
429 	mxge_cmd_t cmd;
430 	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
431 	int status;
432 	uint32_t len;
433 	const char *test = " ";
434 
435 	/*
436 	 * Run a small DMA test.
437 	 * The magic multipliers to the length tell the firmware
438 	 * to do DMA read, write, or read+write tests.  The
439 	 * results are returned in cmd.data0.  The upper 16
440 	 * bits of the return is the number of transfers completed.
441 	 * The lower 16 bits is the time in 0.5us ticks that the
442 	 * transfers took to complete.
443 	 */
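	/*
	 * Worked example (illustrative numbers): with len = 4096, a
	 * result of cmd.data0 = 0x02ee1000 means 0x02ee = 750 transfers
	 * completed in 0x1000 = 4096 half-microsecond ticks, so the
	 * expression below yields 750 * 4096 * 2 / 4096 = 1500 bytes/us,
	 * i.e. ~1500 MB/s.
	 */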
444 
445 	len = sc->tx_boundary;
446 
447 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
448 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
449 	cmd.data2 = len * 0x10000;
450 	status = mxge_send_cmd(sc, test_type, &cmd);
451 	if (status != 0) {
452 		test = "read";
453 		goto abort;
454 	}
455 	sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
456 
457 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
458 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
459 	cmd.data2 = len * 0x1;
460 	status = mxge_send_cmd(sc, test_type, &cmd);
461 	if (status != 0) {
462 		test = "write";
463 		goto abort;
464 	}
465 	sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
466 
467 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
468 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
469 	cmd.data2 = len * 0x10001;
470 	status = mxge_send_cmd(sc, test_type, &cmd);
471 	if (status != 0) {
472 		test = "read/write";
473 		goto abort;
474 	}
475 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
476 	    (cmd.data0 & 0xffff);
477 
478 abort:
479 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
480 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
481 		    test, status);
482 	}
483 	return status;
484 }
485 
486 /*
487  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
488  * when the PCI-E Completion packets are aligned on an 8-byte
489  * boundary.  Some PCI-E chip sets always align Completion packets; on
490  * the ones that do not, the alignment can be enforced by enabling
491  * ECRC generation (if supported).
492  *
493  * When PCI-E Completion packets are not aligned, it is actually more
494  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
495  *
496  * If the driver can neither enable ECRC nor verify that it has
497  * already been enabled, then it must use a firmware image which works
498  * around unaligned completion packets (ethp_z8e.dat), and it should
499  * also ensure that it never gives the device a Read-DMA which is
500  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
501  * enabled, then the driver should use the aligned (eth_z8e.dat)
502  * firmware image, and set tx_boundary to 4KB.
503  */
504 static int
505 mxge_firmware_probe(mxge_softc_t *sc)
506 {
507 	device_t dev = sc->dev;
508 	int reg, status;
509 	uint16_t pectl;
510 
511 	sc->tx_boundary = 4096;
512 
513 	/*
514 	 * Verify the max read request size was set to 4KB
515 	 * before trying the test with 4KB.
516 	 */
517 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
518 		pectl = pci_read_config(dev, reg + 0x8, 2);
519 		if ((pectl & (5 << 12)) != (5 << 12)) {
520 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
521 			    pectl);
522 			sc->tx_boundary = 2048;
523 		}
524 	}
525 
526 	/*
527 	 * Load the optimized firmware (which assumes aligned PCIe
528 	 * completions) in order to see if it works on this host.
529 	 */
530 	sc->fw_name = mxge_fw_aligned;
531 	status = mxge_load_firmware(sc, 1);
532 	if (status != 0)
533 		return status;
534 
535 	/*
536 	 * Enable ECRC if possible
537 	 */
538 	mxge_enable_nvidia_ecrc(sc);
539 
540 	/*
541 	 * Run a DMA test which watches for unaligned completions and
542 	 * aborts on the first one seen.  Not required on Z8ES or newer.
543 	 */
544 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
545 		return 0;
546 
547 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
548 	if (status == 0)
549 		return 0; /* keep the aligned firmware */
550 
551 	if (status != E2BIG)
552 		device_printf(dev, "DMA test failed: %d\n", status);
553 	if (status == ENOSYS) {
554 		device_printf(dev, "Falling back to ethp! "
555 		    "Please install up-to-date firmware\n");
556 	}
557 	return status;
558 }
559 
560 static int
561 mxge_select_firmware(mxge_softc_t *sc)
562 {
563 	int aligned = 0;
564 	int force_firmware = mxge_force_firmware;
565 
566 	if (sc->throttle)
567 		force_firmware = sc->throttle;
568 
569 	if (force_firmware != 0) {
570 		if (force_firmware == 1)
571 			aligned = 1;
572 		else
573 			aligned = 0;
574 		if (bootverbose) {
575 			device_printf(sc->dev,
576 			    "Assuming %s completions (forced)\n",
577 			    aligned ? "aligned" : "unaligned");
578 		}
579 		goto abort;
580 	}
581 
582 	/*
583 	 * If the PCIe link width is 4 or less, we can use the aligned
584 	 * firmware and skip any checks
585 	 */
586 	if (sc->link_width != 0 && sc->link_width <= 4) {
587 		device_printf(sc->dev, "PCIe x%d Link, "
588 		    "expect reduced performance\n", sc->link_width);
589 		aligned = 1;
590 		goto abort;
591 	}
592 
593 	if (mxge_firmware_probe(sc) == 0)
594 		return 0;
595 
596 abort:
597 	if (aligned) {
598 		sc->fw_name = mxge_fw_aligned;
599 		sc->tx_boundary = 4096;
600 	} else {
601 		sc->fw_name = mxge_fw_unaligned;
602 		sc->tx_boundary = 2048;
603 	}
604 	return mxge_load_firmware(sc, 0);
605 }
606 
607 static int
608 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
609 {
610 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
611 		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
612 		    be32toh(hdr->mcp_type));
613 		return EIO;
614 	}
615 
616 	/* Save firmware version for sysctl */
617 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
618 	if (bootverbose)
619 		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
620 
621 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
622 	    &sc->fw_ver_minor, &sc->fw_ver_tiny);
623 
624 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
625 	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
626 		if_printf(sc->ifp, "Found firmware version %s\n",
627 		    sc->fw_version);
628 		if_printf(sc->ifp, "Driver needs %d.%d\n",
629 		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
630 		return EINVAL;
631 	}
632 	return 0;
633 }
634 
635 static void *
636 z_alloc(void *nil, u_int items, u_int size)
637 {
638 	return kmalloc(items * size, M_TEMP, M_WAITOK);
639 }
640 
641 static void
642 z_free(void *nil, void *ptr)
643 {
644 	kfree(ptr, M_TEMP);
645 }
646 
647 static int
648 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
649 {
650 	z_stream zs;
651 	char *inflate_buffer;
652 	const struct firmware *fw;
653 	const mcp_gen_header_t *hdr;
654 	unsigned hdr_offset;
655 	int status;
656 	unsigned int i;
657 	char dummy;
658 	size_t fw_len;
659 
660 	fw = firmware_get(sc->fw_name);
661 	if (fw == NULL) {
662 		if_printf(sc->ifp, "Could not find firmware image %s\n",
663 		    sc->fw_name);
664 		return ENOENT;
665 	}
666 
667 	/* Setup zlib and decompress f/w */
668 	bzero(&zs, sizeof(zs));
669 	zs.zalloc = z_alloc;
670 	zs.zfree = z_free;
671 	status = inflateInit(&zs);
672 	if (status != Z_OK) {
673 		status = EIO;
674 		goto abort_with_fw;
675 	}
676 
677 	/*
678 	 * The uncompressed size is stored as the firmware version,
679 	 * which would otherwise go unused
680 	 */
681 	fw_len = (size_t)fw->version;
682 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
683 	zs.avail_in = fw->datasize;
684 	zs.next_in = __DECONST(char *, fw->data);
685 	zs.avail_out = fw_len;
686 	zs.next_out = inflate_buffer;
687 	status = inflate(&zs, Z_FINISH);
688 	if (status != Z_STREAM_END) {
689 		if_printf(sc->ifp, "zlib %d\n", status);
690 		status = EIO;
691 		goto abort_with_buffer;
692 	}
693 
694 	/* Check id */
695 	hdr_offset =
696 	htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
697 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
698 		if_printf(sc->ifp, "Bad firmware file\n");
699 		status = EIO;
700 		goto abort_with_buffer;
701 	}
702 	hdr = (const void*)(inflate_buffer + hdr_offset);
703 
704 	status = mxge_validate_firmware(sc, hdr);
705 	if (status != 0)
706 		goto abort_with_buffer;
707 
708 	/* Copy the inflated firmware to NIC SRAM. */
709 	for (i = 0; i < fw_len; i += 256) {
710 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
711 		    min(256U, (unsigned)(fw_len - i)));
712 		wmb();
713 		dummy = *sc->sram;
714 		wmb();
715 	}
716 
717 	*limit = fw_len;
718 	status = 0;
719 abort_with_buffer:
720 	kfree(inflate_buffer, M_TEMP);
721 	inflateEnd(&zs);
722 abort_with_fw:
723 	firmware_put(fw, FIRMWARE_UNLOAD);
724 	return status;
725 }
726 
727 /*
728  * Enable or disable periodic RDMAs from the host to make certain
729  * chipsets resend dropped PCIe messages
730  */
731 static void
732 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
733 {
734 	char buf_bytes[72];
735 	volatile uint32_t *confirm;
736 	volatile char *submit;
737 	uint32_t *buf, dma_low, dma_high;
738 	int i;
739 
740 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
741 
742 	/* Clear confirmation addr */
743 	confirm = (volatile uint32_t *)sc->cmd;
744 	*confirm = 0;
745 	wmb();
746 
747 	/*
748 	 * Send an rdma command to the PCIe engine, and wait for the
749 	 * response in the confirmation address.  The firmware should
750 	 * write a -1 there to indicate it is alive and well
751 	 */
752 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
753 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
754 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
755 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
756 	buf[2] = htobe32(0xffffffff);		/* confirm data */
757 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
758 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
759 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
760 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
761 	buf[5] = htobe32(enable);		/* enable? */
762 
763 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
764 
765 	mxge_pio_copy(submit, buf, 64);
766 	wmb();
767 	DELAY(1000);
768 	wmb();
769 	i = 0;
770 	while (*confirm != 0xffffffff && i < 20) {
771 		DELAY(1000);
772 		i++;
773 	}
774 	if (*confirm != 0xffffffff) {
775 		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
776 		    (enable ? "enable" : "disable"), confirm, *confirm);
777 	}
778 }
779 
780 static int
781 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
782 {
783 	mcp_cmd_t *buf;
784 	char buf_bytes[sizeof(*buf) + 8];
785 	volatile mcp_cmd_response_t *response = sc->cmd;
786 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
787 	uint32_t dma_low, dma_high;
788 	int err, sleep_total = 0;
789 
790 	/* Ensure buf is aligned to 8 bytes */
791 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
792 
793 	buf->data0 = htobe32(data->data0);
794 	buf->data1 = htobe32(data->data1);
795 	buf->data2 = htobe32(data->data2);
796 	buf->cmd = htobe32(cmd);
797 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
798 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
799 
800 	buf->response_addr.low = htobe32(dma_low);
801 	buf->response_addr.high = htobe32(dma_high);
802 
803 	response->result = 0xffffffff;
804 	wmb();
805 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
806 
807 	/*
808 	 * Wait up to 20ms
809 	 */
810 	err = EAGAIN;
811 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
812 		wmb();
813 		switch (be32toh(response->result)) {
814 		case 0:
815 			data->data0 = be32toh(response->data);
816 			err = 0;
817 			break;
818 		case 0xffffffff:
819 			DELAY(1000);
820 			break;
821 		case MXGEFW_CMD_UNKNOWN:
822 			err = ENOSYS;
823 			break;
824 		case MXGEFW_CMD_ERROR_UNALIGNED:
825 			err = E2BIG;
826 			break;
827 		case MXGEFW_CMD_ERROR_BUSY:
828 			err = EBUSY;
829 			break;
830 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
831 			err = ENXIO;
832 			break;
833 		default:
834 			if_printf(sc->ifp, "command %d failed, result = %d\n",
835 			    cmd, be32toh(response->result));
836 			err = ENXIO;
837 			break;
838 		}
839 		if (err != EAGAIN)
840 			break;
841 	}
842 	if (err == EAGAIN) {
843 		if_printf(sc->ifp, "command %d timed out, result = %d\n",
844 		    cmd, be32toh(response->result));
845 	}
846 	return err;
847 }
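#if 0
/*
 * Usage sketch (compiled out): mxge_send_cmd() marshals data0..data2
 * into big-endian form and busy-waits on the DMA'd response; query
 * results come back in cmd.data0.  E.g., mirroring a real caller
 * below:
 */
static int
mxge_query_max_slices_example(mxge_softc_t *sc, uint32_t *max_slices)
{
	mxge_cmd_t cmd;
	int err;

	memset(&cmd, 0, sizeof(cmd));
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (err == 0)
		*max_slices = cmd.data0;
	return err;
}
#endif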
848 
849 static int
850 mxge_adopt_running_firmware(mxge_softc_t *sc)
851 {
852 	struct mcp_gen_header *hdr;
853 	const size_t bytes = sizeof(struct mcp_gen_header);
854 	size_t hdr_offset;
855 	int status;
856 
857 	/*
858 	 * Find running firmware header
859 	 */
860 	hdr_offset =
861 	htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
862 
863 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
864 		if_printf(sc->ifp, "Running firmware has bad header offset "
865 		    "(%zu)\n", hdr_offset);
866 		return EIO;
867 	}
868 
869 	/*
870 	 * Copy header of running firmware from SRAM to host memory to
871 	 * validate firmware
872 	 */
873 	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
874 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
875 	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
876 	status = mxge_validate_firmware(sc, hdr);
877 	kfree(hdr, M_DEVBUF);
878 
879 	/*
880 	 * Check to see if adopted firmware has bug where adopting
881 	 * it will cause broadcasts to be filtered unless the NIC
882 	 * is kept in ALLMULTI mode
883 	 */
884 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
885 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
886 		sc->adopted_rx_filter_bug = 1;
887 		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
888 		    "working around rx filter bug\n",
889 		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
890 	}
891 
892 	return status;
893 }
894 
895 static int
896 mxge_load_firmware(mxge_softc_t *sc, int adopt)
897 {
898 	volatile uint32_t *confirm;
899 	volatile char *submit;
900 	char buf_bytes[72];
901 	uint32_t *buf, size, dma_low, dma_high;
902 	int status, i;
903 
904 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
905 
906 	size = sc->sram_size;
907 	status = mxge_load_firmware_helper(sc, &size);
908 	if (status) {
909 		if (!adopt)
910 			return status;
911 
912 		/*
913 		 * Try to use the currently running firmware, if
914 		 * it is new enough
915 		 */
916 		status = mxge_adopt_running_firmware(sc);
917 		if (status) {
918 			if_printf(sc->ifp,
919 			    "failed to adopt running firmware\n");
920 			return status;
921 		}
922 		if_printf(sc->ifp, "Successfully adopted running firmware\n");
923 
924 		if (sc->tx_boundary == 4096) {
925 			if_printf(sc->ifp,
926 			     "Using firmware currently running on NIC.  "
927 			     "For optimal\n");
928 			if_printf(sc->ifp, "performance consider loading "
929 			     "optimized firmware\n");
930 		}
931 		sc->fw_name = mxge_fw_unaligned;
932 		sc->tx_boundary = 2048;
933 		return 0;
934 	}
935 
936 	/* Clear confirmation addr */
937 	confirm = (volatile uint32_t *)sc->cmd;
938 	*confirm = 0;
939 	wmb();
940 
941 	/*
942 	 * Send a reload command to the bootstrap MCP, and wait for the
943 	 * response in the confirmation address.  The firmware should
944 	 * write a -1 there to indicate it is alive and well
945 	 */
946 
947 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
948 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
949 
950 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
951 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
952 	buf[2] = htobe32(0xffffffff);	/* confirm data */
953 
954 	/*
955 	 * FIX: All newest firmware should un-protect the bottom of
956 	 * the sram before handoff. However, the very first interfaces
957 	 * do not. Therefore the handoff copy must skip the first 8 bytes
958 	 */
959 					/* where the code starts */
960 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
961 	buf[4] = htobe32(size - 8); 	/* length of code */
962 	buf[5] = htobe32(8);		/* where to copy to */
963 	buf[6] = htobe32(0);		/* where to jump to */
964 
965 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
966 	mxge_pio_copy(submit, buf, 64);
967 	wmb();
968 	DELAY(1000);
969 	wmb();
970 	i = 0;
971 	while (*confirm != 0xffffffff && i < 20) {
972 		DELAY(1000*10);
973 		i++;
974 	}
975 	if (*confirm != 0xffffffff) {
976 		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
977 		    confirm, *confirm);
978 		return ENXIO;
979 	}
980 	return 0;
981 }
982 
983 static int
984 mxge_update_mac_address(mxge_softc_t *sc)
985 {
986 	mxge_cmd_t cmd;
987 	uint8_t *addr = sc->mac_addr;
988 
989 	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
990 	    (addr[2] << 8) | addr[3];
991 	cmd.data1 = (addr[4] << 8) | (addr[5]);
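	/*
	 * E.g. the (illustrative) address 00:60:dd:12:34:56 packs as
	 * data0 = 0x0060dd12 and data1 = 0x00003456.
	 */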
992 	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
993 }
994 
995 static int
996 mxge_change_pause(mxge_softc_t *sc, int pause)
997 {
998 	mxge_cmd_t cmd;
999 	int status;
1000 
1001 	if (pause)
1002 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
1003 	else
1004 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
1005 	if (status) {
1006 		if_printf(sc->ifp, "Failed to set flow control mode\n");
1007 		return ENXIO;
1008 	}
1009 	sc->pause = pause;
1010 	return 0;
1011 }
1012 
1013 static void
1014 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1015 {
1016 	mxge_cmd_t cmd;
1017 	int status;
1018 
1019 	if (mxge_always_promisc)
1020 		promisc = 1;
1021 
1022 	if (promisc)
1023 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1024 	else
1025 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1026 	if (status)
1027 		if_printf(sc->ifp, "Failed to set promisc mode\n");
1028 }
1029 
1030 static void
1031 mxge_set_multicast_list(mxge_softc_t *sc)
1032 {
1033 	mxge_cmd_t cmd;
1034 	struct ifmultiaddr *ifma;
1035 	struct ifnet *ifp = sc->ifp;
1036 	int err;
1037 
1038 	/* This firmware is known to not support multicast */
1039 	if (!sc->fw_multicast_support)
1040 		return;
1041 
1042 	/* Disable multicast filtering while we play with the lists */
1043 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1044 	if (err != 0) {
1045 		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1046 		    "error status: %d\n", err);
1047 		return;
1048 	}
1049 
1050 	if (sc->adopted_rx_filter_bug)
1051 		return;
1052 
1053 	if (ifp->if_flags & IFF_ALLMULTI) {
1054 		/* Request to disable multicast filtering, so quit here */
1055 		return;
1056 	}
1057 
1058 	/* Flush all the filters */
1059 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1060 	if (err != 0) {
1061 		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1062 		    "error status: %d\n", err);
1063 		return;
1064 	}
1065 
1066 	/*
1067 	 * Walk the multicast list, and add each address
1068 	 */
1069 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1070 		if (ifma->ifma_addr->sa_family != AF_LINK)
1071 			continue;
1072 
1073 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1074 		    &cmd.data0, 4);
1075 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1076 		    &cmd.data1, 2);
1077 		cmd.data0 = htonl(cmd.data0);
1078 		cmd.data1 = htonl(cmd.data1);
1079 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1080 		if (err != 0) {
1081 			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1082 			    "error status: %d\n", err);
1083 			/* Abort, leaving multicast filtering off */
1084 			return;
1085 		}
1086 	}
1087 
1088 	/* Enable multicast filtering */
1089 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1090 	if (err != 0) {
1091 		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1092 		    "error status: %d\n", err);
1093 	}
1094 }
1095 
1096 #if 0
1097 static int
1098 mxge_max_mtu(mxge_softc_t *sc)
1099 {
1100 	mxge_cmd_t cmd;
1101 	int status;
1102 
1103 	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1104 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1105 
1106 	/* try to set nbufs to see if we can
1107 	   use virtually contiguous jumbos */
1108 	cmd.data0 = 0;
1109 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1110 			       &cmd);
1111 	if (status == 0)
1112 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1113 
1114 	/* otherwise, we're limited to MJUMPAGESIZE */
1115 	return MJUMPAGESIZE - MXGEFW_PAD;
1116 }
1117 #endif
1118 
1119 static int
1120 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1121 {
1122 	struct mxge_slice_state *ss;
1123 	mxge_rx_done_t *rx_done;
1124 	volatile uint32_t *irq_claim;
1125 	mxge_cmd_t cmd;
1126 	int slice, status, rx_intr_size;
1127 
1128 	/*
1129 	 * Try to send a reset command to the card to see if it
1130 	 * is alive
1131 	 */
1132 	memset(&cmd, 0, sizeof (cmd));
1133 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1134 	if (status != 0) {
1135 		if_printf(sc->ifp, "failed reset\n");
1136 		return ENXIO;
1137 	}
1138 
1139 	mxge_dummy_rdma(sc, 1);
1140 
1141 	/*
1142 	 * Set the intrq size
1143 	 * XXX assume 4byte mcp_slot
1144 	 */
1145 	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
1146 	cmd.data0 = rx_intr_size;
1147 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1148 
1149 	/*
1150 	 * Even though we already know how many slices are supported
1151 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1152 	 * has magic side effects, and must be called after a reset.
1153 	 * It must be called prior to calling any RSS related cmds,
1154 	 * including assigning an interrupt queue for anything but
1155 	 * slice 0.  It must also be called *after*
1156 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1157 	 * the firmware to compute offsets.
1158 	 */
1159 	if (sc->num_slices > 1) {
1160 		/* Ask the maximum number of slices it supports */
1161 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1162 		if (status != 0) {
1163 			if_printf(sc->ifp, "failed to get number of slices\n");
1164 			return status;
1165 		}
1166 
1167 		/*
1168 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1169 		 * to setting up the interrupt queue DMA
1170 		 */
1171 		cmd.data0 = sc->num_slices;
1172 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1173 		if (sc->num_tx_rings > 1)
1174 			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1175 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1176 		if (status != 0) {
1177 			if_printf(sc->ifp, "failed to set number of slices\n");
1178 			return status;
1179 		}
1180 	}
1181 
1182 	if (interrupts_setup) {
1183 		/* Now exchange information about interrupts  */
1184 		for (slice = 0; slice < sc->num_slices; slice++) {
1185 			ss = &sc->ss[slice];
1186 
1187 			rx_done = &ss->rx_data.rx_done;
1188 			memset(rx_done->entry, 0, rx_intr_size);
1189 
1190 			cmd.data0 =
1191 			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1192 			cmd.data1 =
1193 			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1194 			cmd.data2 = slice;
1195 			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
1196 			    &cmd);
1197 		}
1198 	}
1199 
1200 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1201 	    &cmd);
1202 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1203 
1204 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1205 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1206 
1207 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1208 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1209 
1210 	if (status != 0) {
1211 		if_printf(sc->ifp, "failed to set interrupt parameters\n");
1212 		return status;
1213 	}
1214 
1215 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1216 
1217 	/* Run a DMA benchmark */
1218 	mxge_dma_test(sc, MXGEFW_DMA_TEST);
1219 
1220 	for (slice = 0; slice < sc->num_slices; slice++) {
1221 		ss = &sc->ss[slice];
1222 
1223 		ss->irq_claim = irq_claim + (2 * slice);
1224 
1225 		/* Reset mcp/driver shared state back to 0 */
1226 		ss->rx_data.rx_done.idx = 0;
1227 		ss->tx.req = 0;
1228 		ss->tx.done = 0;
1229 		ss->tx.pkt_done = 0;
1230 		ss->tx.queue_active = 0;
1231 		ss->tx.activate = 0;
1232 		ss->tx.deactivate = 0;
1233 		ss->rx_data.rx_big.cnt = 0;
1234 		ss->rx_data.rx_small.cnt = 0;
1235 		if (ss->fw_stats != NULL)
1236 			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1237 	}
1238 	sc->rdma_tags_available = 15;
1239 
1240 	status = mxge_update_mac_address(sc);
1241 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1242 	mxge_change_pause(sc, sc->pause);
1243 	mxge_set_multicast_list(sc);
1244 
1245 	if (sc->throttle) {
1246 		cmd.data0 = sc->throttle;
1247 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1248 			if_printf(sc->ifp, "can't enable throttle\n");
1249 	}
1250 	return status;
1251 }
1252 
1253 static int
1254 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1255 {
1256 	mxge_cmd_t cmd;
1257 	mxge_softc_t *sc;
1258 	int err;
1259 	unsigned int throttle;
1260 
1261 	sc = arg1;
1262 	throttle = sc->throttle;
1263 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1264 	if (err != 0)
1265 		return err;
1266 
1267 	if (throttle == sc->throttle)
1268 		return 0;
1269 
1270 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1271 		return EINVAL;
1272 
1273 	ifnet_serialize_all(sc->ifp);
1274 
1275 	cmd.data0 = throttle;
1276 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1277 	if (err == 0)
1278 		sc->throttle = throttle;
1279 
1280 	ifnet_deserialize_all(sc->ifp);
1281 	return err;
1282 }
1283 
1284 static int
1285 mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
1286 {
1287 	mxge_softc_t *sc;
1288 	int err, use_rss;
1289 
1290 	sc = arg1;
1291 	use_rss = sc->use_rss;
1292 	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
1293 	if (err != 0)
1294 		return err;
1295 
1296 	if (use_rss == sc->use_rss)
1297 		return 0;
1298 
1299 	ifnet_serialize_all(sc->ifp);
1300 
1301 	sc->use_rss = use_rss;
1302 	if (sc->ifp->if_flags & IFF_RUNNING) {
1303 		mxge_close(sc, 0);
1304 		mxge_open(sc);
1305 	}
1306 
1307 	ifnet_deserialize_all(sc->ifp);
1308 	return err;
1309 }
1310 
1311 static int
1312 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1313 {
1314 	mxge_softc_t *sc;
1315 	unsigned int intr_coal_delay;
1316 	int err;
1317 
1318 	sc = arg1;
1319 	intr_coal_delay = sc->intr_coal_delay;
1320 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1321 	if (err != 0)
1322 		return err;
1323 
1324 	if (intr_coal_delay == sc->intr_coal_delay)
1325 		return 0;
1326 
1327 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1328 		return EINVAL;
1329 
1330 	ifnet_serialize_all(sc->ifp);
1331 
1332 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1333 	sc->intr_coal_delay = intr_coal_delay;
1334 
1335 	ifnet_deserialize_all(sc->ifp);
1336 	return err;
1337 }
1338 
1339 static int
1340 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1341 {
1342 	mxge_softc_t *sc;
1343 	unsigned int enabled;
1344 	int err;
1345 
1346 	sc = arg1;
1347 	enabled = sc->pause;
1348 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1349 	if (err != 0)
1350 		return err;
1351 
1352 	if (enabled == sc->pause)
1353 		return 0;
1354 
1355 	ifnet_serialize_all(sc->ifp);
1356 	err = mxge_change_pause(sc, enabled);
1357 	ifnet_deserialize_all(sc->ifp);
1358 
1359 	return err;
1360 }
1361 
1362 static int
1363 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1364 {
1365 	int err;
1366 
1367 	if (arg1 == NULL)
1368 		return EFAULT;
1369 	arg2 = be32toh(*(int *)arg1);
1370 	arg1 = NULL;
1371 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1372 
1373 	return err;
1374 }
1375 
1376 static void
1377 mxge_rem_sysctls(mxge_softc_t *sc)
1378 {
1379 	if (sc->ss != NULL) {
1380 		struct mxge_slice_state *ss;
1381 		int slice;
1382 
1383 		for (slice = 0; slice < sc->num_slices; slice++) {
1384 			ss = &sc->ss[slice];
1385 			if (ss->sysctl_tree != NULL) {
1386 				sysctl_ctx_free(&ss->sysctl_ctx);
1387 				ss->sysctl_tree = NULL;
1388 			}
1389 		}
1390 	}
1391 
1392 	if (sc->slice_sysctl_tree != NULL) {
1393 		sysctl_ctx_free(&sc->slice_sysctl_ctx);
1394 		sc->slice_sysctl_tree = NULL;
1395 	}
1396 
1397 	if (sc->sysctl_tree != NULL) {
1398 		sysctl_ctx_free(&sc->sysctl_ctx);
1399 		sc->sysctl_tree = NULL;
1400 	}
1401 }
1402 
1403 static void
1404 mxge_add_sysctls(mxge_softc_t *sc)
1405 {
1406 	struct sysctl_ctx_list *ctx;
1407 	struct sysctl_oid_list *children;
1408 	mcp_irq_data_t *fw;
1409 	struct mxge_slice_state *ss;
1410 	int slice;
1411 	char slice_num[8];
1412 
1413 	ctx = &sc->sysctl_ctx;
1414 	sysctl_ctx_init(ctx);
1415 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1416 	    OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1417 	if (sc->sysctl_tree == NULL) {
1418 		device_printf(sc->dev, "can't add sysctl node\n");
1419 		return;
1420 	}
1421 
1422 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1423 	fw = sc->ss[0].fw_stats;
1424 
1425 	/*
1426 	 * Random information
1427 	 */
1428 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1429 	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1430 
1431 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1432 	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1433 
1434 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1435 	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1436 
1437 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1438 	    CTLFLAG_RD, &sc->link_width, 0, "link width");
1439 
1440 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1441 	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1442 
1443 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1444 	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1445 
1446 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1447 	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1448 
1449 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1450 	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1451 
1452 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1453 	    CTLFLAG_RD, &sc->read_write_dma, 0,
1454 	    "DMA concurrent Read/Write speed in MB/s");
1455 
1456 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1457 	    CTLFLAG_RD, &sc->watchdog_resets, 0,
1458 	    "Number of times NIC was reset");
1459 
1460 	/*
1461 	 * Performance related tunables
1462 	 */
1463 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1464 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1465 	    "Interrupt coalescing delay in usecs");
1466 
1467 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1468 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1469 	    "Transmit throttling");
1470 
1471 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1472 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1473 	    "Flow control (PAUSE) enabled");
1474 
1475 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
1476 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
1477 	    "Use RSS");
1478 
1479 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1480 	    CTLFLAG_RW, &mxge_deassert_wait, 0,
1481 	    "Wait for IRQ line to go low in ihandler");
1482 
1483 	/*
1484 	 * Stats block from firmware is in network byte order.
1485 	 * Need to swap it
1486 	 */
1487 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1488 	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1489 	    mxge_handle_be32, "I", "link up");
1490 
1491 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1492 	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1493 	    mxge_handle_be32, "I", "rdma_tags_available");
1494 
1495 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1496 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1497 	    mxge_handle_be32, "I", "dropped_bad_crc32");
1498 
1499 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1500 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1501 	    mxge_handle_be32, "I", "dropped_bad_phy");
1502 
1503 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1504 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1505 	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1506 
1507 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1508 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1509 	    mxge_handle_be32, "I", "dropped_link_overflow");
1510 
1511 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1512 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1513 	    mxge_handle_be32, "I", "dropped_multicast_filtered");
1514 
1515 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1516 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1517 	    mxge_handle_be32, "I", "dropped_no_big_buffer");
1518 
1519 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1520 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1521 	    mxge_handle_be32, "I", "dropped_no_small_buffer");
1522 
1523 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1524 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1525 	    mxge_handle_be32, "I", "dropped_overrun");
1526 
1527 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1528 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1529 	    mxge_handle_be32, "I", "dropped_pause");
1530 
1531 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1532 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1533 	    mxge_handle_be32, "I", "dropped_runt");
1534 
1535 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1536 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1537 	    mxge_handle_be32, "I", "dropped_unicast_filtered");
1538 
1539 	/* add counters exported for debugging from all slices */
1540 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1541 	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1542 	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1543 	if (sc->slice_sysctl_tree == NULL) {
1544 		device_printf(sc->dev, "can't add slice sysctl node\n");
1545 		return;
1546 	}
1547 
1548 	for (slice = 0; slice < sc->num_slices; slice++) {
1549 		ss = &sc->ss[slice];
1550 		sysctl_ctx_init(&ss->sysctl_ctx);
1551 		ctx = &ss->sysctl_ctx;
1552 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1553 		ksprintf(slice_num, "%d", slice);
1554 		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1555 		    slice_num, CTLFLAG_RD, 0, "");
1556 		if (ss->sysctl_tree == NULL) {
1557 			device_printf(sc->dev,
1558 			    "can't add %d slice sysctl node\n", slice);
1559 			return;	/* XXX continue? */
1560 		}
1561 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1562 
1563 		/*
1564 		 * XXX change to ULONG
1565 		 */
1566 
1567 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1568 		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");
1569 
1570 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1571 		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");
1572 
1573 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1574 		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1575 
1576 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1577 		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1578 
1579 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1580 		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1581 
1582 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1583 		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1584 
1585 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1586 		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1587 
1588 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1589 		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1590 	}
1591 }
1592 
1593 /*
1594  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1595  * backwards one at a time and handle ring wraps
1596  */
1597 static __inline void
1598 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1599     mcp_kreq_ether_send_t *src, int cnt)
1600 {
1601 	int idx, starting_slot;
1602 
1603 	starting_slot = tx->req;
1604 	while (cnt > 1) {
1605 		cnt--;
1606 		idx = (starting_slot + cnt) & tx->mask;
1607 		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
1608 		wmb();
1609 	}
1610 }
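/*
 * E.g. (illustrative numbers) with tx->mask = 0xff, tx->req = 0xfe and
 * cnt = 4, the loop above writes slots 0x01, 0x00 and 0xff, in that
 * order; slot 0xfe (the first request) is left for mxge_submit_req()
 * to write last, once its valid flags may be set.
 */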
1611 
1612 /*
1613  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1614  * at most 32 bytes at a time, so as to avoid involving the software
1615  * pio handler in the nic.  We re-write the first segment's flags
1616  * to mark them valid only after writing the entire chain
1617  */
1618 static __inline void
1619 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1620 {
1621 	int idx, i;
1622 	uint32_t *src_ints;
1623 	volatile uint32_t *dst_ints;
1624 	mcp_kreq_ether_send_t *srcp;
1625 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1626 	uint8_t last_flags;
1627 
1628 	idx = tx->req & tx->mask;
1629 
1630 	last_flags = src->flags;
1631 	src->flags = 0;
1632 	wmb();
1633 	dst = dstp = &tx->lanai[idx];
1634 	srcp = src;
1635 
1636 	if ((idx + cnt) < tx->mask) {
1637 		for (i = 0; i < cnt - 1; i += 2) {
1638 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1639 			wmb(); /* force write every 32 bytes */
1640 			srcp += 2;
1641 			dstp += 2;
1642 		}
1643 	} else {
1644 		/*
1645 		 * Submit all but the first request, and ensure
1646 		 * that it is submitted below
1647 		 */
1648 		mxge_submit_req_backwards(tx, src, cnt);
1649 		i = 0;
1650 	}
1651 	if (i < cnt) {
1652 		/* Submit the first request */
1653 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1654 		wmb(); /* barrier before setting valid flag */
1655 	}
1656 
1657 	/* Re-write the last 32-bits with the valid flags */
1658 	src->flags = last_flags;
1659 	src_ints = (uint32_t *)src;
1660 	src_ints += 3;
1661 	dst_ints = (volatile uint32_t *)dst;
1662 	dst_ints += 3;
1663 	*dst_ints = *src_ints;
1664 	tx->req += cnt;
1665 	wmb();
1666 }
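/*
 * E.g. (illustrative numbers) for cnt = 5 with no ring wrap: requests
 * 0+1 and 2+3 are copied as two 32-byte PIO writes, request 4 is
 * copied alone, and only then is the first request's flags word
 * rewritten, which is what tells the NIC that the whole chain is
 * valid.
 */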
1667 
1668 static int
1669 mxge_pullup_tso(struct mbuf **mp)
1670 {
1671 	int hoff, iphlen, thoff;
1672 	struct mbuf *m;
1673 
1674 	m = *mp;
1675 	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1676 
1677 	iphlen = m->m_pkthdr.csum_iphlen;
1678 	thoff = m->m_pkthdr.csum_thlen;
1679 	hoff = m->m_pkthdr.csum_lhlen;
1680 
1681 	KASSERT(iphlen > 0, ("invalid ip hlen"));
1682 	KASSERT(thoff > 0, ("invalid tcp hlen"));
1683 	KASSERT(hoff > 0, ("invalid ether hlen"));
1684 
1685 	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1686 		m = m_pullup(m, hoff + iphlen + thoff);
1687 		if (m == NULL) {
1688 			*mp = NULL;
1689 			return ENOBUFS;
1690 		}
1691 		*mp = m;
1692 	}
1693 	return 0;
1694 }
1695 
1696 static int
1697 mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
1698     struct mbuf *m, int busdma_seg_cnt)
1699 {
1700 	mcp_kreq_ether_send_t *req;
1701 	bus_dma_segment_t *seg;
1702 	uint32_t low, high_swapped;
1703 	int len, seglen, cum_len, cum_len_next;
1704 	int next_is_first, chop, cnt, rdma_count, small;
1705 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1706 	uint8_t flags, flags_next;
1707 	struct mxge_buffer_state *info_last;
1708 	bus_dmamap_t map = info_map->map;
1709 
1710 	mss = m->m_pkthdr.tso_segsz;
1711 
1712 	/*
1713 	 * Negative cum_len signifies to the send loop that we are
1714 	 * still in the header portion of the TSO packet.
1715 	 */
1716 	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1717 	    m->m_pkthdr.csum_thlen);
1718 
1719 	/*
1720 	 * TSO implies checksum offload on this hardware
1721 	 */
1722 	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1723 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1724 
1725 	/*
1726 	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
1727 	 * out where to put the checksum by parsing the header.
1728 	 */
1729 	pseudo_hdr_offset = htobe16(mss);
1730 
1731 	req = tx->req_list;
1732 	seg = tx->seg_list;
1733 	cnt = 0;
1734 	rdma_count = 0;
1735 
1736 	/*
1737 	 * "rdma_count" is the number of RDMAs belonging to the current
1738 	 * packet BEFORE the current send request.  For non-TSO packets,
1739 	 * this is equal to "count".
1740 	 *
1741 	 * For TSO packets, rdma_count needs to be reset to 0 after a
1742 	 * segment cut.
1743 	 *
1744 	 * The rdma_count field of the send request is the number of
1745 	 * RDMAs of the packet starting at that request.  For TSO send
1746 	 * requests with one or more cuts in the middle, this is the
1747 	 * number of RDMAs starting after the last cut in the request.
1748 	 * All previous segments before the last cut implicitly have 1
1749 	 * RDMA.
1750 	 *
1751 	 * Since the number of RDMAs is not known beforehand, it must be
1752 	 * filled in retroactively - after each segmentation cut or at
1753 	 * the end of the entire packet.
1754 	 */
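	/*
	 * Illustrative sketch: for a request chain H P P P | P P (a
	 * header, then payload requests with one MSS cut), the
	 * "(req - rdma_count)->rdma_count = ..." fix-ups below
	 * retroactively store the RDMA count into the first request of
	 * each run, since that count is only known once the run has
	 * been fully emitted.
	 */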
1755 
1756 	while (busdma_seg_cnt) {
1757 		/*
1758 		 * Break the busdma segment up into pieces
1759 		 */
1760 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1761 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1762 		len = seg->ds_len;
1763 
1764 		while (len) {
1765 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1766 			seglen = len;
1767 			cum_len_next = cum_len + seglen;
1768 			(req - rdma_count)->rdma_count = rdma_count + 1;
1769 			if (__predict_true(cum_len >= 0)) {
1770 				/* Payload */
1771 				chop = (cum_len_next > mss);
1772 				cum_len_next = cum_len_next % mss;
1773 				next_is_first = (cum_len_next == 0);
1774 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1775 				flags_next |=
1776 				    next_is_first * MXGEFW_FLAGS_FIRST;
1777 				rdma_count |= -(chop | next_is_first);
1778 				rdma_count += chop & !next_is_first;
1779 			} else if (cum_len_next >= 0) {
1780 				/* Header ends */
1781 				rdma_count = -1;
1782 				cum_len_next = 0;
1783 				seglen = -cum_len;
1784 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1785 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1786 				    MXGEFW_FLAGS_FIRST |
1787 				    (small * MXGEFW_FLAGS_SMALL);
1788 			}
1789 
1790 			req->addr_high = high_swapped;
1791 			req->addr_low = htobe32(low);
1792 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1793 			req->pad = 0;
1794 			req->rdma_count = 1;
1795 			req->length = htobe16(seglen);
1796 			req->cksum_offset = cksum_offset;
1797 			req->flags =
1798 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1799 			low += seglen;
1800 			len -= seglen;
1801 			cum_len = cum_len_next;
1802 			flags = flags_next;
1803 			req++;
1804 			cnt++;
1805 			rdma_count++;
1806 			if (__predict_false(cksum_offset > seglen))
1807 				cksum_offset -= seglen;
1808 			else
1809 				cksum_offset = 0;
1810 			if (__predict_false(cnt > tx->max_desc))
1811 				goto drop;
1812 		}
1813 		busdma_seg_cnt--;
1814 		seg++;
1815 	}
1816 	(req - rdma_count)->rdma_count = rdma_count;
1817 
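	/*
	 * Walk backwards from the end of the request list, flagging
	 * descriptors as TSO_LAST until one already flagged FIRST or
	 * TSO_CHOP (inclusive) is reached, so that the entire final
	 * TSO segment is marked as last.
	 */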
1818 	do {
1819 		req--;
1820 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1821 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1822 
1823 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1824 
1825 	info_map->map = info_last->map;
1826 	info_last->map = map;
1827 	info_last->m = m;
1828 
1829 	mxge_submit_req(tx, tx->req_list, cnt);
1830 
1831 	if (tx->send_go != NULL && tx->queue_active == 0) {
1832 		/* Tell the NIC to start polling this slice */
1833 		*tx->send_go = 1;
1834 		tx->queue_active = 1;
1835 		tx->activate++;
1836 		wmb();
1837 	}
1838 	return 0;
1839 
1840 drop:
1841 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1842 	m_freem(m);
1843 	return ENOBUFS;
1844 }
1845 
1846 static int
1847 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1848 {
1849 	mcp_kreq_ether_send_t *req;
1850 	bus_dma_segment_t *seg;
1851 	bus_dmamap_t map;
1852 	int cnt, cum_len, err, i, idx, odd_flag;
1853 	uint16_t pseudo_hdr_offset;
1854 	uint8_t flags, cksum_offset;
1855 	struct mxge_buffer_state *info_map, *info_last;
1856 
1857 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1858 		err = mxge_pullup_tso(&m);
1859 		if (__predict_false(err))
1860 			return err;
1861 	}
1862 
1863 	/*
1864 	 * Map the frame for DMA
1865 	 */
1866 	idx = tx->req & tx->mask;
1867 	info_map = &tx->info[idx];
1868 	map = info_map->map;
1869 
1870 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1871 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1872 	if (__predict_false(err != 0))
1873 		goto drop;
1874 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1875 
1876 	/*
1877 	 * TSO is different enough, we handle it in another routine
1878 	 */
1879 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1880 		return mxge_encap_tso(tx, info_map, m, cnt);
1881 
1882 	req = tx->req_list;
1883 	cksum_offset = 0;
1884 	pseudo_hdr_offset = 0;
1885 	flags = MXGEFW_FLAGS_NO_TSO;
1886 
1887 	/*
1888 	 * Checksum offloading
1889 	 */
1890 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1891 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1892 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1893 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1894 		req->cksum_offset = cksum_offset;
1895 		flags |= MXGEFW_FLAGS_CKSUM;
1896 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1897 	} else {
1898 		odd_flag = 0;
1899 	}
1900 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1901 		flags |= MXGEFW_FLAGS_SMALL;
1902 
1903 	/*
1904 	 * Convert segments into a request list
1905 	 */
1906 	cum_len = 0;
1907 	seg = tx->seg_list;
1908 	req->flags = MXGEFW_FLAGS_FIRST;
1909 	for (i = 0; i < cnt; i++) {
1910 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1911 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1912 		req->length = htobe16(seg->ds_len);
1913 		req->cksum_offset = cksum_offset;
1914 		if (cksum_offset > seg->ds_len)
1915 			cksum_offset -= seg->ds_len;
1916 		else
1917 			cksum_offset = 0;
1918 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1919 		req->pad = 0; /* complete solid 16-byte block */
1920 		req->rdma_count = 1;
1921 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1922 		cum_len += seg->ds_len;
1923 		seg++;
1924 		req++;
1925 		req->flags = 0;
1926 	}
1927 	req--;
1928 
1929 	/*
1930 	 * Pad runt to 60 bytes (64-byte minimum frame less the 4-byte FCS)
1931 	 */
1932 	if (cum_len < 60) {
1933 		req++;
1934 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1935 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1936 		req->length = htobe16(60 - cum_len);
1937 		req->cksum_offset = 0;
1938 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1939 		req->pad = 0; /* complete solid 16-byte block */
1940 		req->rdma_count = 1;
1941 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1942 		cnt++;
1943 	}
1944 
1945 	tx->req_list[0].rdma_count = cnt;
1946 #if 0
1947 	/* print what the firmware will see */
1948 	for (i = 0; i < cnt; i++) {
1949 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1950 		    "cso:%d, flags:0x%x, rdma:%d\n",
1951 		    i, (int)ntohl(tx->req_list[i].addr_high),
1952 		    (int)ntohl(tx->req_list[i].addr_low),
1953 		    (int)ntohs(tx->req_list[i].length),
1954 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1955 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1956 		    tx->req_list[i].rdma_count);
1957 	}
1958 	kprintf("--------------\n");
1959 #endif
1960 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1961 
1962 	info_map->map = info_last->map;
1963 	info_last->map = map;
1964 	info_last->m = m;
1965 
1966 	mxge_submit_req(tx, tx->req_list, cnt);
1967 
1968 	if (tx->send_go != NULL && tx->queue_active == 0) {
1969 		/* Tell the NIC to start polling this slice */
1970 		*tx->send_go = 1;
1971 		tx->queue_active = 1;
1972 		tx->activate++;
1973 		wmb();
1974 	}
1975 	return 0;
1976 
1977 drop:
1978 	m_freem(m);
1979 	return err;
1980 }
1981 
1982 static void
1983 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1984 {
1985 	mxge_softc_t *sc = ifp->if_softc;
1986 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1987 	bus_addr_t zeropad;
1988 	int encap = 0;
1989 
1990 	KKASSERT(tx->ifsq == ifsq);
1991 	ASSERT_SERIALIZED(&tx->tx_serialize);
1992 
1993 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1994 		return;
1995 
1996 	zeropad = sc->zeropad_dma.dmem_busaddr;
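	/*
	 * (tx->req - tx->done) is the number of descriptors still in
	 * flight; only dequeue a packet while more than max_desc
	 * slots are free, so a maximally fragmented frame can always
	 * be encapsulated.
	 */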
1997 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1998 		struct mbuf *m;
1999 		int error;
2000 
2001 		m = ifsq_dequeue(ifsq);
2002 		if (m == NULL)
2003 			goto done;
2004 
2005 		BPF_MTAP(ifp, m);
2006 		error = mxge_encap(tx, m, zeropad);
2007 		if (!error)
2008 			encap = 1;
2009 		else
2010 			IFNET_STAT_INC(ifp, oerrors, 1);
2011 	}
2012 
2013 	/* Ran out of transmit slots */
2014 	ifsq_set_oactive(ifsq);
2015 done:
2016 	if (encap)
2017 		tx->watchdog.wd_timer = 5;
2018 }
2019 
2020 static void
2021 mxge_watchdog(struct ifaltq_subque *ifsq)
2022 {
2023 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
2024 	struct mxge_softc *sc = ifp->if_softc;
2025 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
2026 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
2027 
2028 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2029 
2030 	/* If pause counter is unchanged, the ring is truly stuck: reset */
2031 	if (tx->watchdog_rx_pause == rx_pause) {
2032 		mxge_warn_stuck(sc, tx, 0);
2033 		mxge_watchdog_reset(sc);
2034 		return;
2035 	} else {
2036 		if_printf(ifp, "Flow control blocking xmits, "
2037 		    "check link partner\n");
2038 	}
2039 	tx->watchdog_rx_pause = rx_pause;
2040 }
2041 
2042 /*
2043  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2044  * at most 32 bytes at a time, so as to avoid involving the software
2045  * pio handler in the NIC.  We re-write the first segment's low
2046  * DMA address to mark it valid only after we write the entire chunk
2047  * in a burst.
2048  */
2049 static __inline void
2050 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2051     mcp_kreq_ether_recv_t *src)
2052 {
2053 	uint32_t low;
2054 
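	/*
	 * Each receive descriptor is 8 bytes (the high/low halves of
	 * a DMA address), so the two 4-descriptor copies below are
	 * 32-byte bursts.  The first descriptor is poisoned with an
	 * invalid low address until the whole block has been written,
	 * then the final store makes it valid.
	 */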
2055 	low = src->addr_low;
2056 	src->addr_low = 0xffffffff;
2057 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2058 	wmb();
2059 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2060 	wmb();
2061 	src->addr_low = low;
2062 	dst->addr_low = low;
2063 	wmb();
2064 }
2065 
2066 static int
2067 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2068     boolean_t init)
2069 {
2070 	bus_dma_segment_t seg;
2071 	struct mbuf *m;
2072 	int cnt, err, mflag;
2073 
2074 	mflag = MB_DONTWAIT;
2075 	if (__predict_false(init))
2076 		mflag = MB_WAIT;
2077 
2078 	m = m_gethdr(mflag, MT_DATA);
2079 	if (m == NULL) {
2080 		err = ENOBUFS;
2081 		if (__predict_false(init)) {
2082 			/*
2083 			 * During initialization, there
2084 			 * is nothing to setup; bail out
2085 			 * is nothing to set up; bail out
2086 			return err;
2087 		}
2088 		goto done;
2089 	}
2090 	m->m_len = m->m_pkthdr.len = MHLEN;
2091 
2092 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2093 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2094 	if (err != 0) {
2095 		m_freem(m);
2096 		if (__predict_false(init)) {
2097 			/*
2098 			 * During initialization, there
2099 			 * is nothing to set up; bail out
2100 			 */
2101 			return err;
2102 		}
2103 		goto done;
2104 	}
2105 
2106 	rx->info[idx].m = m;
2107 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2108 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2109 
2110 done:
2111 	if ((idx & 7) == 7)
2112 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2113 	return err;
2114 }
2115 
2116 static int
2117 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2118     boolean_t init)
2119 {
2120 	bus_dma_segment_t seg;
2121 	struct mbuf *m;
2122 	int cnt, err, mflag;
2123 
2124 	mflag = MB_DONTWAIT;
2125 	if (__predict_false(init))
2126 		mflag = MB_WAIT;
2127 
2128 	if (rx->cl_size == MCLBYTES)
2129 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2130 	else
2131 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2132 	if (m == NULL) {
2133 		err = ENOBUFS;
2134 		if (__predict_false(init)) {
2135 			/*
2136 			 * During initialization, there
2137 			 * is nothing to set up; bail out
2138 			 */
2139 			return err;
2140 		}
2141 		goto done;
2142 	}
2143 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2144 
2145 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2146 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2147 	if (err != 0) {
2148 		m_freem(m);
2149 		if (__predict_false(init)) {
2150 			/*
2151 			 * During initialization, there
2152 			 * is nothing to set up; bail out
2153 			 */
2154 			return err;
2155 		}
2156 		goto done;
2157 	}
2158 
2159 	rx->info[idx].m = m;
2160 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2161 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2162 
2163 done:
2164 	if ((idx & 7) == 7)
2165 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2166 	return err;
2167 }
2168 
2169 /*
2170  * Myri10GE hardware checksums are not valid if the sender
2171  * padded the frame with non-zero padding.  This is because
2172  * the firmware just does a simple 16-bit 1s complement
2173  * checksum across the entire frame, excluding the first 14
2174 	 * bytes.  It is best to simply check the checksum and
2175 	 * tell the stack about it only if the checksum is good.
2176  */
2177 static __inline uint16_t
2178 mxge_rx_csum(struct mbuf *m, int csum)
2179 {
2180 	const struct ether_header *eh;
2181 	const struct ip *ip;
2182 	uint16_t c;
2183 
2184 	eh = mtod(m, const struct ether_header *);
2185 
2186 	/* Only deal with IPv4 TCP & UDP for now */
2187 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2188 		return 1;
2189 
2190 	ip = (const struct ip *)(eh + 1);
2191 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2192 		return 1;
2193 
2194 #ifdef INET
2195 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2196 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2197 	          (ip->ip_hl << 2) + ip->ip_p));
2198 #else
2199 	c = 1;
2200 #endif
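	/*
	 * The NIC's partial sum covers everything past the Ethernet
	 * header; adding the pseudo-header makes a valid TCP/UDP
	 * checksum sum to 0xffff, so the complement below yields 0
	 * exactly when the checksum is good.
	 */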
2201 	c ^= 0xffff;
2202 	return c;
2203 }
2204 
2205 static void
2206 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2207 {
2208 	struct ether_vlan_header *evl;
2209 	uint32_t partial;
2210 
2211 	evl = mtod(m, struct ether_vlan_header *);
2212 
2213 	/*
2214 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2215 	 * what the firmware thought was the end of the ethernet
2216 	 * header.
2217 	 */
2218 
2219 	/* Put checksum into host byte order */
2220 	*csum = ntohs(*csum);
2221 
2222 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
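	/*
	 * One's complement subtraction: adding ~partial together with
	 * the end-around carry removes the 4 VLAN bytes from the sum,
	 * and the two folds below reduce the result back to 16 bits.
	 */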
2223 	*csum += ~partial;
2224 	*csum += ((*csum) < ~partial);
2225 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2226 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2227 
2228 	/*
2229 	 * Restore checksum to network byte order;
2230 	 * later consumers expect this
2231 	 */
2232 	*csum = htons(*csum);
2233 
2234 	/* save the tag */
2235 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2236 	m->m_flags |= M_VLANTAG;
2237 
2238 	/*
2239 	 * Remove the 802.1q header by copying the Ethernet
2240 	 * addresses over it and adjusting the beginning of
2241 	 * the data in the mbuf.  The encapsulated Ethernet
2242 	 * type field is already in place.
2243 	 */
2244 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2245 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2246 	m_adj(m, EVL_ENCAPLEN);
2247 }
2248 
2250 static __inline void
2251 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2252     uint32_t len, uint32_t csum)
2253 {
2254 	struct mbuf *m;
2255 	const struct ether_header *eh;
2256 	bus_dmamap_t old_map;
2257 	int idx;
2258 
2259 	idx = rx->cnt & rx->mask;
2260 	rx->cnt++;
2261 
2262 	/* Save a pointer to the received mbuf */
2263 	m = rx->info[idx].m;
2264 
2265 	/* Try to replace the received mbuf */
2266 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2267 		/* Drop the frame -- the old mbuf is re-cycled */
2268 		IFNET_STAT_INC(ifp, ierrors, 1);
2269 		return;
2270 	}
2271 
2272 	/* Unmap the received buffer */
2273 	old_map = rx->info[idx].map;
2274 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2275 	bus_dmamap_unload(rx->dmat, old_map);
2276 
2277 	/* Swap the bus_dmamap_t's */
2278 	rx->info[idx].map = rx->extra_map;
2279 	rx->extra_map = old_map;
2280 
2281 	/*
2282 	 * mcp implicitly skips 1st 2 bytes so that packet is properly
2283 	 * aligned
2284 	 */
2285 	m->m_data += MXGEFW_PAD;
2286 
2287 	m->m_pkthdr.rcvif = ifp;
2288 	m->m_len = m->m_pkthdr.len = len;
2289 
2290 	IFNET_STAT_INC(ifp, ipackets, 1);
2291 
2292 	eh = mtod(m, const struct ether_header *);
2293 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2294 		mxge_vlan_tag_remove(m, &csum);
2295 
2296 	/* If the checksum is valid, mark it in the mbuf header */
2297 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2298 	    mxge_rx_csum(m, csum) == 0) {
2299 		/* Tell the stack that the checksum is good */
2300 		m->m_pkthdr.csum_data = 0xffff;
2301 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2302 		    CSUM_DATA_VALID;
2303 	}
2304 	ifp->if_input(ifp, m);
2305 }
2306 
2307 static __inline void
2308 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2309     uint32_t len, uint32_t csum)
2310 {
2311 	const struct ether_header *eh;
2312 	struct mbuf *m;
2313 	bus_dmamap_t old_map;
2314 	int idx;
2315 
2316 	idx = rx->cnt & rx->mask;
2317 	rx->cnt++;
2318 
2319 	/* Save a pointer to the received mbuf */
2320 	m = rx->info[idx].m;
2321 
2322 	/* Try to replace the received mbuf */
2323 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2324 		/* Drop the frame -- the old mbuf is re-cycled */
2325 		IFNET_STAT_INC(ifp, ierrors, 1);
2326 		return;
2327 	}
2328 
2329 	/* Unmap the received buffer */
2330 	old_map = rx->info[idx].map;
2331 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2332 	bus_dmamap_unload(rx->dmat, old_map);
2333 
2334 	/* Swap the bus_dmamap_t's */
2335 	rx->info[idx].map = rx->extra_map;
2336 	rx->extra_map = old_map;
2337 
2338 	/*
2339 	 * mcp implicitly skips 1st 2 bytes so that packet is properly
2340 	 * aligned
2341 	 */
2342 	m->m_data += MXGEFW_PAD;
2343 
2344 	m->m_pkthdr.rcvif = ifp;
2345 	m->m_len = m->m_pkthdr.len = len;
2346 
2347 	IFNET_STAT_INC(ifp, ipackets, 1);
2348 
2349 	eh = mtod(m, const struct ether_header *);
2350 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2351 		mxge_vlan_tag_remove(m, &csum);
2352 
2353 	/* If the checksum is valid, mark it in the mbuf header */
2354 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2355 	    mxge_rx_csum(m, csum) == 0) {
2356 		/* Tell the stack that the checksum is good */
2357 		m->m_pkthdr.csum_data = 0xffff;
2358 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2359 		    CSUM_DATA_VALID;
2360 	}
2361 	ifp->if_input(ifp, m);
2362 }
2363 
2364 static __inline void
2365 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2366 {
2367 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2368 
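	/*
	 * A negative cycle (the interrupt paths pass -1) never
	 * reaches 0, so it means "drain the ring"; a positive value
	 * bounds the work done per call (presumably for polling).
	 */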
2369 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2370 		uint16_t length, checksum;
2371 
2372 		length = ntohs(rx_done->entry[rx_done->idx].length);
2373 		rx_done->entry[rx_done->idx].length = 0;
2374 
2375 		checksum = rx_done->entry[rx_done->idx].checksum;
2376 
2377 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2378 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2379 			    length, checksum);
2380 		} else {
2381 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2382 			    length, checksum);
2383 		}
2384 
2385 		rx_done->idx++;
2386 		rx_done->idx &= rx_done->mask;
2387 		--cycle;
2388 	}
2389 }
2390 
2391 static __inline void
2392 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2393 {
2394 	ASSERT_SERIALIZED(&tx->tx_serialize);
2395 
2396 	while (tx->pkt_done != mcp_idx) {
2397 		struct mbuf *m;
2398 		int idx;
2399 
2400 		idx = tx->done & tx->mask;
2401 		tx->done++;
2402 
2403 		m = tx->info[idx].m;
2404 		/*
2405 		 * mbuf and DMA map only attached to the first
2406 		 * segment per-mbuf.
2407 		 */
2408 		if (m != NULL) {
2409 			tx->pkt_done++;
2410 			IFNET_STAT_INC(ifp, opackets, 1);
2411 			tx->info[idx].m = NULL;
2412 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2413 			m_freem(m);
2414 		}
2415 	}
2416 
2417 	/*
2418 	 * If we have space, clear OACTIVE to tell the stack that
2419 	 * it's OK to send packets
2420 	 */
2421 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2422 		ifsq_clr_oactive(tx->ifsq);
2423 		if (tx->req == tx->done) {
2424 			/* Reset watchdog */
2425 			tx->watchdog.wd_timer = 0;
2426 		}
2427 	}
2428 
2429 	if (!ifsq_is_empty(tx->ifsq))
2430 		ifsq_devstart(tx->ifsq);
2431 
2432 	if (tx->send_stop != NULL && tx->req == tx->done) {
2433 		/*
2434 		 * Let the NIC stop polling this queue, since there
2435 		 * are no more transmits pending
2436 		 */
2437 		*tx->send_stop = 1;
2438 		tx->queue_active = 0;
2439 		tx->deactivate++;
2440 		wmb();
2441 	}
2442 }
2443 
2444 static struct mxge_media_type mxge_xfp_media_types[] = {
2445 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2446 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2447 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2448 	{0,		(1 << 5),	"10GBASE-ER"},
2449 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2450 	{0,		(1 << 3),	"10GBASE-SW"},
2451 	{0,		(1 << 2),	"10GBASE-LW"},
2452 	{0,		(1 << 1),	"10GBASE-EW"},
2453 	{0,		(1 << 0),	"Reserved"}
2454 };
2455 
2456 static struct mxge_media_type mxge_sfp_media_types[] = {
2457 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2458 	{0,		(1 << 7),	"Reserved"},
2459 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2460 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2461 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2462 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2463 };
2464 
2465 static void
2466 mxge_media_set(mxge_softc_t *sc, int media_type)
2467 {
2468 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2469 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2470 	sc->current_media = media_type;
2471 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2472 }
2473 
2474 static void
2475 mxge_media_init(mxge_softc_t *sc)
2476 {
2477 	const char *ptr;
2478 	int i;
2479 
2480 	ifmedia_removeall(&sc->media);
2481 	mxge_media_set(sc, IFM_AUTO);
2482 
2483 	/*
2484 	 * Parse the product code to determine the interface type
2485 	 * (CX4, XFP, SFP+ or Quad Ribbon Fiber) by looking at the character
2486 	 * after the 3rd dash in the driver's cached copy of the
2487 	 * EEPROM's product code string.
2488 	 */
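	/*
	 * Illustrative (hypothetical) part number: in "10G-PCIE-8B-S"
	 * the character after the third dash is 'S', i.e. SFP+.
	 */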
2489 	ptr = sc->product_code_string;
2490 	if (ptr == NULL) {
2491 		if_printf(sc->ifp, "Missing product code\n");
2492 		return;
2493 	}
2494 
2495 	for (i = 0; i < 3; i++, ptr++) {
2496 		ptr = strchr(ptr, '-');
2497 		if (ptr == NULL) {
2498 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2499 			return;
2500 		}
2501 	}
2502 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2503 		/* -C is CX4 */
2504 		sc->connector = MXGE_CX4;
2505 		mxge_media_set(sc, IFM_10G_CX4);
2506 	} else if (*ptr == 'Q') {
2507 		/* -Q is Quad Ribbon Fiber */
2508 		sc->connector = MXGE_QRF;
2509 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2510 		/* DragonFly has no media type for Quad ribbon fiber */
2511 	} else if (*ptr == 'R') {
2512 		/* -R is XFP */
2513 		sc->connector = MXGE_XFP;
2514 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2515 		/* -S or -2S is SFP+ */
2516 		sc->connector = MXGE_SFP;
2517 	} else {
2518 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2519 	}
2520 }
2521 
2522 /*
2523  * Determine the media type for a NIC.  Some XFPs will identify
2524  * themselves only when their link is up, so this is initiated via a
2525  * link up interrupt.  However, this can potentially take up to
2526  * several milliseconds, so it is run via the watchdog routine, rather
2527  * than in the interrupt handler itself.
2528  */
2529 static void
2530 mxge_media_probe(mxge_softc_t *sc)
2531 {
2532 	mxge_cmd_t cmd;
2533 	const char *cage_type;
2534 	struct mxge_media_type *mxge_media_types = NULL;
2535 	int i, err, ms, mxge_media_type_entries;
2536 	uint32_t byte;
2537 
2538 	sc->need_media_probe = 0;
2539 
2540 	if (sc->connector == MXGE_XFP) {
2541 		/* -R is XFP */
2542 		mxge_media_types = mxge_xfp_media_types;
2543 		mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2544 		    sizeof(mxge_xfp_media_types[0]);
2545 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2546 		cage_type = "XFP";
2547 	} else if (sc->connector == MXGE_SFP) {
2548 		/* -S or -2S is SFP+ */
2549 		mxge_media_types = mxge_sfp_media_types;
2550 		mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2551 		    sizeof(mxge_sfp_media_types[0]);
2552 		cage_type = "SFP+";
2553 		byte = 3;
2554 	} else {
2555 		/* nothing to do; media type cannot change */
2556 		return;
2557 	}
2558 
2559 	/*
2560 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2561 	 * now we try to determine what is in the cage by using the
2562 	 * firmware's I2C commands to read the 10GbE compliance
2563 	 * register.  We read just one byte, which may take over
2564 	 * a millisecond.
2565 	 */
2566 
2567 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2568 	cmd.data1 = byte;
2569 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2570 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2571 		if_printf(sc->ifp, "failed to read XFP\n");
2572 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2573 		if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2574 	if (err != MXGEFW_CMD_OK)
2575 		return;
2576 
2577 	/* Now we wait for the data to be cached */
2578 	cmd.data0 = byte;
2579 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2580 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2581 		DELAY(1000);
2582 		cmd.data0 = byte;
2583 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2584 	}
2585 	if (err != MXGEFW_CMD_OK) {
2586 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2587 		    cage_type, err, ms);
2588 		return;
2589 	}
2590 
2591 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2592 		if (bootverbose) {
2593 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2594 			    mxge_media_types[0].name);
2595 		}
2596 		if (sc->current_media != mxge_media_types[0].flag) {
2597 			mxge_media_init(sc);
2598 			mxge_media_set(sc, mxge_media_types[0].flag);
2599 		}
2600 		return;
2601 	}
2602 	for (i = 1; i < mxge_media_type_entries; i++) {
2603 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2604 			if (bootverbose) {
2605 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2606 				    mxge_media_types[i].name);
2607 			}
2608 
2609 			if (sc->current_media != mxge_media_types[i].flag) {
2610 				mxge_media_init(sc);
2611 				mxge_media_set(sc, mxge_media_types[i].flag);
2612 			}
2613 			return;
2614 		}
2615 	}
2616 	if (bootverbose) {
2617 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2618 		    cmd.data0);
2619 	}
2620 }
2621 
2622 static void
2623 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2624 {
2625 	if (sc->link_state != stats->link_up) {
2626 		sc->link_state = stats->link_up;
2627 		if (sc->link_state) {
2628 			sc->ifp->if_link_state = LINK_STATE_UP;
2629 			if_link_state_change(sc->ifp);
2630 			if (bootverbose)
2631 				if_printf(sc->ifp, "link up\n");
2632 		} else {
2633 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2634 			if_link_state_change(sc->ifp);
2635 			if (bootverbose)
2636 				if_printf(sc->ifp, "link down\n");
2637 		}
2638 		sc->need_media_probe = 1;
2639 	}
2640 
2641 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2642 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2643 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2644 		    sc->rdma_tags_available);
2645 	}
2646 
2647 	if (stats->link_down) {
2648 		sc->down_cnt += stats->link_down;
2649 		sc->link_state = 0;
2650 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2651 		if_link_state_change(sc->ifp);
2652 	}
2653 }
2654 
2655 static void
2656 mxge_serialize_skipmain(struct mxge_softc *sc)
2657 {
2658 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2659 }
2660 
2661 static void
2662 mxge_deserialize_skipmain(struct mxge_softc *sc)
2663 {
2664 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2665 }
2666 
2667 static void
2668 mxge_legacy(void *arg)
2669 {
2670 	struct mxge_slice_state *ss = arg;
2671 	mxge_softc_t *sc = ss->sc;
2672 	mcp_irq_data_t *stats = ss->fw_stats;
2673 	mxge_tx_ring_t *tx = &ss->tx;
2674 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2675 	uint32_t send_done_count;
2676 	uint8_t valid;
2677 
2678 	ASSERT_SERIALIZED(&sc->main_serialize);
2679 
2680 	/* Make sure the DMA has finished */
2681 	if (!stats->valid)
2682 		return;
2683 	valid = stats->valid;
2684 
2685 	/* Lower legacy IRQ */
2686 	*sc->irq_deassert = 0;
2687 	if (!mxge_deassert_wait) {
2688 		/* Don't wait for confirmation that the irq is low */
2689 		stats->valid = 0;
2690 	}
2691 
2692 	mxge_serialize_skipmain(sc);
2693 
2694 	/*
2695 	 * Loop while waiting for legacy irq deassertion
2696 	 * XXX do we really want to loop?
2697 	 */
2698 	do {
2699 		/* Check for transmit completes and receives */
2700 		send_done_count = be32toh(stats->send_done_count);
2701 		while ((send_done_count != tx->pkt_done) ||
2702 		       (rx_done->entry[rx_done->idx].length != 0)) {
2703 			if (send_done_count != tx->pkt_done) {
2704 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2705 				    (int)send_done_count);
2706 			}
2707 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2708 			send_done_count = be32toh(stats->send_done_count);
2709 		}
2710 		if (mxge_deassert_wait)
2711 			wmb();
2712 	} while (*((volatile uint8_t *)&stats->valid));
2713 
2714 	mxge_deserialize_skipmain(sc);
2715 
2716 	/* Firmware link & error stats are meaningful only on the first slice */
2717 	if (__predict_false(stats->stats_updated))
2718 		mxge_intr_status(sc, stats);
2719 
2720 	/* Check to see if we have rx token to pass back */
2721 	if (valid & 0x1)
2722 		*ss->irq_claim = be32toh(3);
2723 	*(ss->irq_claim + 1) = be32toh(3);
2724 }
2725 
2726 static void
2727 mxge_msi(void *arg)
2728 {
2729 	struct mxge_slice_state *ss = arg;
2730 	mxge_softc_t *sc = ss->sc;
2731 	mcp_irq_data_t *stats = ss->fw_stats;
2732 	mxge_tx_ring_t *tx = &ss->tx;
2733 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2734 	uint32_t send_done_count;
2735 	uint8_t valid;
2736 #ifndef IFPOLL_ENABLE
2737 	const boolean_t polling = FALSE;
2738 #else
2739 	boolean_t polling = FALSE;
2740 #endif
2741 
2742 	ASSERT_SERIALIZED(&sc->main_serialize);
2743 
2744 	/* Make sure the DMA has finished */
2745 	if (__predict_false(!stats->valid))
2746 		return;
2747 
2748 	valid = stats->valid;
2749 	stats->valid = 0;
2750 
2751 #ifdef IFPOLL_ENABLE
2752 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2753 		polling = TRUE;
2754 #endif
2755 
2756 	if (!polling) {
2757 		/* Check for receives */
2758 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2759 		if (rx_done->entry[rx_done->idx].length != 0)
2760 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2761 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2762 	}
2763 
2764 	/*
2765 	 * Check for transmit completes
2766 	 *
2767 	 * NOTE:
2768 	 * Since pkt_done is only changed by mxge_tx_done(),
2769 	 * which is called only in interrupt handler, the
2770 	 * check w/o holding tx serializer is MPSAFE.
2771 	 */
2772 	send_done_count = be32toh(stats->send_done_count);
2773 	if (send_done_count != tx->pkt_done) {
2774 		lwkt_serialize_enter(&tx->tx_serialize);
2775 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2776 		lwkt_serialize_exit(&tx->tx_serialize);
2777 	}
2778 
2779 	if (__predict_false(stats->stats_updated))
2780 		mxge_intr_status(sc, stats);
2781 
2782 	/* Check to see if we have rx token to pass back */
2783 	if (!polling && (valid & 0x1))
2784 		*ss->irq_claim = be32toh(3);
2785 	*(ss->irq_claim + 1) = be32toh(3);
2786 }
2787 
2788 static void
2789 mxge_msix_rx(void *arg)
2790 {
2791 	struct mxge_slice_state *ss = arg;
2792 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2793 
2794 #ifdef IFPOLL_ENABLE
2795 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2796 		return;
2797 #endif
2798 
2799 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2800 
2801 	if (rx_done->entry[rx_done->idx].length != 0)
2802 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2803 
2804 	*ss->irq_claim = be32toh(3);
2805 }
2806 
2807 static void
2808 mxge_msix_rxtx(void *arg)
2809 {
2810 	struct mxge_slice_state *ss = arg;
2811 	mxge_softc_t *sc = ss->sc;
2812 	mcp_irq_data_t *stats = ss->fw_stats;
2813 	mxge_tx_ring_t *tx = &ss->tx;
2814 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2815 	uint32_t send_done_count;
2816 	uint8_t valid;
2817 #ifndef IFPOLL_ENABLE
2818 	const boolean_t polling = FALSE;
2819 #else
2820 	boolean_t polling = FALSE;
2821 #endif
2822 
2823 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2824 
2825 	/* Make sure the DMA has finished */
2826 	if (__predict_false(!stats->valid))
2827 		return;
2828 
2829 	valid = stats->valid;
2830 	stats->valid = 0;
2831 
2832 #ifdef IFPOLL_ENABLE
2833 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2834 		polling = TRUE;
2835 #endif
2836 
2837 	/* Check for receives */
2838 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2839 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2840 
2841 	/*
2842 	 * Check for transmit completes
2843 	 *
2844 	 * NOTE:
2845 	 * Since pkt_done is only changed by mxge_tx_done(),
2846 	 * which is called only in interrupt handler, the
2847 	 * check w/o holding tx serializer is MPSAFE.
2848 	 */
2849 	send_done_count = be32toh(stats->send_done_count);
2850 	if (send_done_count != tx->pkt_done) {
2851 		lwkt_serialize_enter(&tx->tx_serialize);
2852 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2853 		lwkt_serialize_exit(&tx->tx_serialize);
2854 	}
2855 
2856 	/* Check to see if we have rx token to pass back */
2857 	if (!polling && (valid & 0x1))
2858 		*ss->irq_claim = be32toh(3);
2859 	*(ss->irq_claim + 1) = be32toh(3);
2860 }
2861 
2862 static void
2863 mxge_init(void *arg)
2864 {
2865 	struct mxge_softc *sc = arg;
2866 
2867 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2868 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2869 		mxge_open(sc);
2870 }
2871 
2872 static void
2873 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2874 {
2875 	int i;
2876 
2877 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2878 		if (ss->rx_data.rx_big.info[i].m == NULL)
2879 			continue;
2880 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2881 		    ss->rx_data.rx_big.info[i].map);
2882 		m_freem(ss->rx_data.rx_big.info[i].m);
2883 		ss->rx_data.rx_big.info[i].m = NULL;
2884 	}
2885 
2886 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2887 		if (ss->rx_data.rx_small.info[i].m == NULL)
2888 			continue;
2889 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2890 		    ss->rx_data.rx_small.info[i].map);
2891 		m_freem(ss->rx_data.rx_small.info[i].m);
2892 		ss->rx_data.rx_small.info[i].m = NULL;
2893 	}
2894 
2895 	/* Transmit ring used only on the first slice */
2896 	if (ss->tx.info == NULL)
2897 		return;
2898 
2899 	for (i = 0; i <= ss->tx.mask; i++) {
2900 		if (ss->tx.info[i].m == NULL)
2901 			continue;
2902 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2903 		m_freem(ss->tx.info[i].m);
2904 		ss->tx.info[i].m = NULL;
2905 	}
2906 }
2907 
2908 static void
2909 mxge_free_mbufs(mxge_softc_t *sc)
2910 {
2911 	int slice;
2912 
2913 	for (slice = 0; slice < sc->num_slices; slice++)
2914 		mxge_free_slice_mbufs(&sc->ss[slice]);
2915 }
2916 
2917 static void
2918 mxge_free_slice_rings(struct mxge_slice_state *ss)
2919 {
2920 	int i;
2921 
2922 	if (ss->rx_data.rx_done.entry != NULL) {
2923 		mxge_dma_free(&ss->rx_done_dma);
2924 		ss->rx_data.rx_done.entry = NULL;
2925 	}
2926 
2927 	if (ss->tx.req_list != NULL) {
2928 		kfree(ss->tx.req_list, M_DEVBUF);
2929 		ss->tx.req_list = NULL;
2930 	}
2931 
2932 	if (ss->tx.seg_list != NULL) {
2933 		kfree(ss->tx.seg_list, M_DEVBUF);
2934 		ss->tx.seg_list = NULL;
2935 	}
2936 
2937 	if (ss->rx_data.rx_small.shadow != NULL) {
2938 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2939 		ss->rx_data.rx_small.shadow = NULL;
2940 	}
2941 
2942 	if (ss->rx_data.rx_big.shadow != NULL) {
2943 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2944 		ss->rx_data.rx_big.shadow = NULL;
2945 	}
2946 
2947 	if (ss->tx.info != NULL) {
2948 		if (ss->tx.dmat != NULL) {
2949 			for (i = 0; i <= ss->tx.mask; i++) {
2950 				bus_dmamap_destroy(ss->tx.dmat,
2951 				    ss->tx.info[i].map);
2952 			}
2953 			bus_dma_tag_destroy(ss->tx.dmat);
2954 		}
2955 		kfree(ss->tx.info, M_DEVBUF);
2956 		ss->tx.info = NULL;
2957 	}
2958 
2959 	if (ss->rx_data.rx_small.info != NULL) {
2960 		if (ss->rx_data.rx_small.dmat != NULL) {
2961 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2962 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2963 				    ss->rx_data.rx_small.info[i].map);
2964 			}
2965 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2966 			    ss->rx_data.rx_small.extra_map);
2967 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2968 		}
2969 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2970 		ss->rx_data.rx_small.info = NULL;
2971 	}
2972 
2973 	if (ss->rx_data.rx_big.info != NULL) {
2974 		if (ss->rx_data.rx_big.dmat != NULL) {
2975 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2976 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2977 				    ss->rx_data.rx_big.info[i].map);
2978 			}
2979 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2980 			    ss->rx_data.rx_big.extra_map);
2981 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2982 		}
2983 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2984 		ss->rx_data.rx_big.info = NULL;
2985 	}
2986 }
2987 
2988 static void
2989 mxge_free_rings(mxge_softc_t *sc)
2990 {
2991 	int slice;
2992 
2993 	if (sc->ss == NULL)
2994 		return;
2995 
2996 	for (slice = 0; slice < sc->num_slices; slice++)
2997 		mxge_free_slice_rings(&sc->ss[slice]);
2998 }
2999 
3000 static int
3001 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3002     int tx_ring_entries)
3003 {
3004 	mxge_softc_t *sc = ss->sc;
3005 	size_t bytes;
3006 	int err, i;
3007 
3008 	/*
3009 	 * Allocate per-slice receive resources
3010 	 */
3011 
3012 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
3013 	    rx_ring_entries - 1;
3014 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
3015 
3016 	/* Allocate the rx shadow rings */
3017 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3018 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3019 
3020 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3021 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3022 
3023 	/* Allocate the rx host info rings */
3024 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3025 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3026 
3027 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3028 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3029 
3030 	/* Allocate the rx busdma resources */
3031 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3032 				 1,			/* alignment */
3033 				 4096,			/* boundary */
3034 				 BUS_SPACE_MAXADDR,	/* low */
3035 				 BUS_SPACE_MAXADDR,	/* high */
3036 				 NULL, NULL,		/* filter */
3037 				 MHLEN,			/* maxsize */
3038 				 1,			/* num segs */
3039 				 MHLEN,			/* maxsegsize */
3040 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3041 				 			/* flags */
3042 				 &ss->rx_data.rx_small.dmat); /* tag */
3043 	if (err != 0) {
3044 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3045 		    err);
3046 		return err;
3047 	}
3048 
3049 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3050 	    &ss->rx_data.rx_small.extra_map);
3051 	if (err != 0) {
3052 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3053 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3054 		ss->rx_data.rx_small.dmat = NULL;
3055 		return err;
3056 	}
3057 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3058 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3059 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3060 		if (err != 0) {
3061 			int j;
3062 
3063 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3064 
3065 			for (j = 0; j < i; ++j) {
3066 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3067 				    ss->rx_data.rx_small.info[j].map);
3068 			}
3069 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3070 			    ss->rx_data.rx_small.extra_map);
3071 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3072 			ss->rx_data.rx_small.dmat = NULL;
3073 			return err;
3074 		}
3075 	}
3076 
3077 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3078 				 1,			/* alignment */
3079 				 4096,			/* boundary */
3080 				 BUS_SPACE_MAXADDR,	/* low */
3081 				 BUS_SPACE_MAXADDR,	/* high */
3082 				 NULL, NULL,		/* filter */
3083 				 4096,			/* maxsize */
3084 				 1,			/* num segs */
3085 				 4096,			/* maxsegsize*/
3086 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3087 				 			/* flags */
3088 				 &ss->rx_data.rx_big.dmat); /* tag */
3089 	if (err != 0) {
3090 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3091 		    err);
3092 		return err;
3093 	}
3094 
3095 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3096 	    &ss->rx_data.rx_big.extra_map);
3097 	if (err != 0) {
3098 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3099 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3100 		ss->rx_data.rx_big.dmat = NULL;
3101 		return err;
3102 	}
3103 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3104 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3105 		    &ss->rx_data.rx_big.info[i].map);
3106 		if (err != 0) {
3107 			int j;
3108 
3109 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3110 			for (j = 0; j < i; ++j) {
3111 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3112 				    ss->rx_data.rx_big.info[j].map);
3113 			}
3114 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3115 			    ss->rx_data.rx_big.extra_map);
3116 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3117 			ss->rx_data.rx_big.dmat = NULL;
3118 			return err;
3119 		}
3120 	}
3121 
3122 	/*
3123 	 * Now allocate TX resources
3124 	 */
3125 
3126 	ss->tx.mask = tx_ring_entries - 1;
3127 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3128 
3129 	/*
3130 	 * Allocate the tx request copy block; it MUST be at least
3131 	 * 8-byte aligned
3132 	 */
3133 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3134 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3135 	    M_DEVBUF, M_WAITOK);
3136 
3137 	/* Allocate the tx busdma segment list */
3138 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3139 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3140 
3141 	/* Allocate the tx host info ring */
3142 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3143 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3144 
3145 	/* Allocate the tx busdma resources */
3146 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3147 				 1,			/* alignment */
3148 				 sc->tx_boundary,	/* boundary */
3149 				 BUS_SPACE_MAXADDR,	/* low */
3150 				 BUS_SPACE_MAXADDR,	/* high */
3151 				 NULL, NULL,		/* filter */
3152 				 IP_MAXPACKET +
3153 				 sizeof(struct ether_vlan_header),
3154 				 			/* maxsize */
3155 				 ss->tx.max_desc - 2,	/* num segs */
3156 				 sc->tx_boundary,	/* maxsegsz */
3157 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3158 				 BUS_DMA_ONEBPAGE,	/* flags */
3159 				 &ss->tx.dmat);		/* tag */
3160 	if (err != 0) {
3161 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3162 		return err;
3163 	}
3164 
3165 	/*
3166 	 * Now use these tags to setup DMA maps for each slot in the ring
3167 	 */
3168 	for (i = 0; i <= ss->tx.mask; i++) {
3169 		err = bus_dmamap_create(ss->tx.dmat,
3170 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3171 		if (err != 0) {
3172 			int j;
3173 
3174 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3175 			for (j = 0; j < i; ++j) {
3176 				bus_dmamap_destroy(ss->tx.dmat,
3177 				    ss->tx.info[j].map);
3178 			}
3179 			bus_dma_tag_destroy(ss->tx.dmat);
3180 			ss->tx.dmat = NULL;
3181 			return err;
3182 		}
3183 	}
3184 	return 0;
3185 }
3186 
3187 static int
3188 mxge_alloc_rings(mxge_softc_t *sc)
3189 {
3190 	mxge_cmd_t cmd;
3191 	int tx_ring_size;
3192 	int tx_ring_entries, rx_ring_entries;
3193 	int err, slice;
3194 
3195 	/* Get ring sizes */
3196 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3197 	if (err != 0) {
3198 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3199 		return err;
3200 	}
3201 	tx_ring_size = cmd.data0;
3202 
3203 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
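	/*
	 * The receive completion ring must hold an entry for every
	 * buffer in both the small and the big rings (its mask is
	 * 2 * rx_ring_entries - 1 in mxge_alloc_slice_rings()), so
	 * each data ring gets half of the rx interrupt slots.
	 */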
3204 	rx_ring_entries = sc->rx_intr_slots / 2;
3205 
3206 	if (bootverbose) {
3207 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3208 		    tx_ring_entries, rx_ring_entries);
3209 	}
3210 
3211 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3212 	ifq_set_ready(&sc->ifp->if_snd);
3213 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3214 
3215 	if (sc->num_tx_rings > 1) {
3216 		sc->ifp->if_mapsubq = ifq_mapsubq_mask;
3217 		ifq_set_subq_mask(&sc->ifp->if_snd, sc->num_tx_rings - 1);
3218 	}
3219 
3220 	for (slice = 0; slice < sc->num_slices; slice++) {
3221 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3222 		    rx_ring_entries, tx_ring_entries);
3223 		if (err != 0) {
3224 			device_printf(sc->dev,
3225 			    "alloc %d slice rings failed\n", slice);
3226 			return err;
3227 		}
3228 	}
3229 	return 0;
3230 }
3231 
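/*
 * Pick the receive cluster size for a given MTU.  As a worked example,
 * assuming the usual 2KB MCLBYTES: a 1500-byte MTU needs 1500 +
 * ETHER_HDR_LEN (14) + EVL_ENCAPLEN (4) + MXGEFW_PAD (2) = 1520 bytes
 * and fits in a standard cluster; larger MTUs fall through to a
 * page-sized jumbo cluster.
 */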
3232 static void
3233 mxge_choose_params(int mtu, int *cl_size)
3234 {
3235 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3236 
3237 	if (bufsize < MCLBYTES) {
3238 		*cl_size = MCLBYTES;
3239 	} else {
3240 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3241 		*cl_size = MJUMPAGESIZE;
3242 	}
3243 }
3244 
3245 static int
3246 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3247 {
3248 	mxge_cmd_t cmd;
3249 	int err, i, slice;
3250 
3251 	slice = ss - ss->sc->ss;
3252 
3253 	/*
3254 	 * Get the lanai pointers to the send and receive rings
3255 	 */
3256 	err = 0;
3257 
3258 	if (ss->sc->num_tx_rings == 1) {
3259 		if (slice == 0) {
3260 			cmd.data0 = slice;
3261 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3262 			    &cmd);
3263 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3264 			    (ss->sc->sram + cmd.data0);
3265 			/* Leave send_go and send_stop as NULL */
3266 		}
3267 	} else {
3268 		cmd.data0 = slice;
3269 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3270 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3271 		    (ss->sc->sram + cmd.data0);
3272 		ss->tx.send_go = (volatile uint32_t *)
3273 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3274 		ss->tx.send_stop = (volatile uint32_t *)
3275 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3276 	}
3277 
3278 	cmd.data0 = slice;
3279 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3280 	ss->rx_data.rx_small.lanai =
3281 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3282 
3283 	cmd.data0 = slice;
3284 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3285 	ss->rx_data.rx_big.lanai =
3286 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3287 
3288 	if (err != 0) {
3289 		if_printf(ss->sc->ifp,
3290 		    "failed to get ring sizes or locations\n");
3291 		return EIO;
3292 	}
3293 
3294 	/*
3295 	 * Stock small receive ring
3296 	 */
3297 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3298 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3299 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3300 		if (err) {
3301 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3302 			    ss->rx_data.rx_small.mask + 1);
3303 			return ENOMEM;
3304 		}
3305 	}
3306 
3307 	/*
3308 	 * Stock big receive ring
3309 	 */
3310 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3311 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3312 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3313 	}
3314 
3315 	ss->rx_data.rx_big.cl_size = cl_size;
3316 
3317 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3318 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3319 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3320 		if (err) {
3321 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3322 			    ss->rx_data.rx_big.mask + 1);
3323 			return ENOMEM;
3324 		}
3325 	}
3326 	return 0;
3327 }
3328 
3329 static int
3330 mxge_open(mxge_softc_t *sc)
3331 {
3332 	struct ifnet *ifp = sc->ifp;
3333 	mxge_cmd_t cmd;
3334 	int err, slice, cl_size, i;
3335 	bus_addr_t bus;
3336 	volatile uint8_t *itable;
3337 	struct mxge_slice_state *ss;
3338 
3339 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3340 
3341 	/* Copy the MAC address in case it was overridden */
3342 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3343 
3344 	err = mxge_reset(sc, 1);
3345 	if (err != 0) {
3346 		if_printf(ifp, "failed to reset\n");
3347 		return EIO;
3348 	}
3349 
3350 	if (sc->num_slices > 1) {
3351 		/* Setup the indirection table */
3352 		cmd.data0 = sc->num_slices;
3353 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3354 
3355 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3356 		if (err != 0) {
3357 			if_printf(ifp, "failed to setup rss tables\n");
3358 			return err;
3359 		}
3360 
3361 		/* Just enable an identity mapping */
3362 		itable = sc->sram + cmd.data0;
3363 		for (i = 0; i < sc->num_slices; i++)
3364 			itable[i] = (uint8_t)i;
3365 
3366 		if (sc->use_rss) {
3367 			volatile uint8_t *hwkey;
3368 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3369 
3370 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3371 			    &cmd);
3372 			if (err != 0) {
3373 				if_printf(ifp, "failed to get rsskey\n");
3374 				return err;
3375 			}
3376 			hwkey = sc->sram + cmd.data0;
3377 
3378 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3379 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3380 				hwkey[i] = swkey[i];
3381 			wmb();
3382 
3383 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3384 			    &cmd);
3385 			if (err != 0) {
3386 				if_printf(ifp, "failed to update rsskey\n");
3387 				return err;
3388 			}
3389 			if (bootverbose)
3390 				if_printf(ifp, "RSS key updated\n");
3391 		}
3392 
3393 		cmd.data0 = 1;
3394 		if (sc->use_rss) {
3395 			if (bootverbose)
3396 				if_printf(ifp, "input hash: RSS\n");
3397 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3398 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3399 		} else {
3400 			if (bootverbose)
3401 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3402 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3403 		}
3404 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3405 		if (err != 0) {
3406 			if_printf(ifp, "failed to enable slices\n");
3407 			return err;
3408 		}
3409 	}
3410 
3411 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3412 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3413 	if (err) {
3414 		/*
3415 		 * Can't change TSO mode to NDIS, so never allow TSO
3416 		 */
3417 		if_printf(ifp, "failed to set TSO mode\n");
3418 		ifp->if_capenable &= ~IFCAP_TSO;
3419 		ifp->if_capabilities &= ~IFCAP_TSO;
3420 		ifp->if_hwassist &= ~CSUM_TSO;
3421 	}
3422 
3423 	mxge_choose_params(ifp->if_mtu, &cl_size);
3424 
3425 	cmd.data0 = 1;
3426 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3427 	/*
3428 	 * Error is only meaningful if we're trying to set
3429 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3430 	 */
3431 
3432 	/*
3433 	 * Give the firmware the mtu and the big and small buffer
3434 	 * sizes.  The firmware wants the big buf size to be a power
3435 	 * of two.  Luckily, DragonFly's clusters are powers of two.
3436 	 */
3437 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3438 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3439 
3440 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3441 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3442 
3443 	cmd.data0 = cl_size;
3444 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3445 
3446 	if (err != 0) {
3447 		if_printf(ifp, "failed to setup params\n");
3448 		goto abort;
3449 	}
3450 
3451 	/* Now give him the pointer to the stats block */
3452 	for (slice = 0; slice < sc->num_slices; slice++) {
3453 		ss = &sc->ss[slice];
3454 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3455 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3456 		cmd.data2 = sizeof(struct mcp_irq_data);
3457 		cmd.data2 |= (slice << 16);
3458 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3459 	}
3460 
3461 	if (err != 0) {
3462 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3463 		bus += offsetof(struct mcp_irq_data, send_done_count);
3464 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3465 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3466 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3467 		    &cmd);
3468 
3469 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3470 		sc->fw_multicast_support = 0;
3471 	} else {
3472 		sc->fw_multicast_support = 1;
3473 	}
3474 
3475 	if (err != 0) {
3476 		if_printf(ifp, "failed to setup params\n");
3477 		goto abort;
3478 	}
3479 
3480 	for (slice = 0; slice < sc->num_slices; slice++) {
3481 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3482 		if (err != 0) {
3483 			if_printf(ifp, "couldn't open slice %d\n", slice);
3484 			goto abort;
3485 		}
3486 	}
3487 
3488 	/* Finally, start the firmware running */
3489 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3490 	if (err) {
3491 		if_printf(ifp, "Couldn't bring up link\n");
3492 		goto abort;
3493 	}
3494 
3495 	ifp->if_flags |= IFF_RUNNING;
3496 	for (i = 0; i < sc->num_tx_rings; ++i) {
3497 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3498 
3499 		ifsq_clr_oactive(tx->ifsq);
3500 		ifsq_watchdog_start(&tx->watchdog);
3501 	}
3502 
3503 	return 0;
3504 
3505 abort:
3506 	mxge_free_mbufs(sc);
3507 	return err;
3508 }
3509 
3510 static void
3511 mxge_close(mxge_softc_t *sc, int down)
3512 {
3513 	struct ifnet *ifp = sc->ifp;
3514 	mxge_cmd_t cmd;
3515 	int err, old_down_cnt, i;
3516 
3517 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3518 
3519 	if (!down) {
3520 		old_down_cnt = sc->down_cnt;
3521 		wmb();
3522 
3523 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3524 		if (err)
3525 			if_printf(ifp, "Couldn't bring down link\n");
3526 
3527 		if (old_down_cnt == sc->down_cnt) {
3528 			/*
3529 			 * Wait for down irq
3530 			 * XXX racy
3531 			 */
3532 			ifnet_deserialize_all(ifp);
3533 			DELAY(10 * sc->intr_coal_delay);
3534 			ifnet_serialize_all(ifp);
3535 		}
3536 
3537 		wmb();
3538 		if (old_down_cnt == sc->down_cnt)
3539 			if_printf(ifp, "never got down irq\n");
3540 	}
3541 	mxge_free_mbufs(sc);
3542 
3543 	ifp->if_flags &= ~IFF_RUNNING;
3544 	for (i = 0; i < sc->num_tx_rings; ++i) {
3545 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3546 
3547 		ifsq_clr_oactive(tx->ifsq);
3548 		ifsq_watchdog_stop(&tx->watchdog);
3549 	}
3550 }
3551 
3552 static void
3553 mxge_setup_cfg_space(mxge_softc_t *sc)
3554 {
3555 	device_t dev = sc->dev;
3556 	int reg;
3557 	uint16_t lnk, pectl;
3558 
3559 	/* Find the PCIe link width and set max read request to 4KB */
3560 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3561 		lnk = pci_read_config(dev, reg + 0x12, 2);
3562 		sc->link_width = (lnk >> 4) & 0x3f;
3563 
3564 		if (sc->pectl == 0) {
3565 			pectl = pci_read_config(dev, reg + 0x8, 2);
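			/*
			 * PCIe Device Control register: bits 14:12
			 * encode the max read request size; the value
			 * 5 selects 4096 bytes.
			 */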
3566 			pectl = (pectl & ~0x7000) | (5 << 12);
3567 			pci_write_config(dev, reg + 0x8, pectl, 2);
3568 			sc->pectl = pectl;
3569 		} else {
3570 			/* Restore saved pectl after watchdog reset */
3571 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3572 		}
3573 	}
3574 
3575 	/* Enable DMA and memory space access */
3576 	pci_enable_busmaster(dev);
3577 }
3578 
3579 static uint32_t
3580 mxge_read_reboot(mxge_softc_t *sc)
3581 {
3582 	device_t dev = sc->dev;
3583 	uint32_t vs;
3584 
3585 	/* Find the vendor specific offset */
3586 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3587 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3588 		return (uint32_t)-1;
3589 	}
3590 	/* Enable read32 mode */
3591 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3592 	/* Tell NIC which register to read */
3593 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3594 	return pci_read_config(dev, vs + 0x14, 4);
3595 }
3596 
3597 static void
3598 mxge_watchdog_reset(mxge_softc_t *sc)
3599 {
3600 	struct pci_devinfo *dinfo;
3601 	int err, running;
3602 	uint32_t reboot;
3603 	uint16_t cmd;
3604 
3605 	err = ENXIO;
3606 
3607 	if_printf(sc->ifp, "Watchdog reset!\n");
3608 
3609 	/*
3610 	 * Check to see if the NIC rebooted.  If it did, then all of
3611 	 * PCI config space has been reset, and things like the
3612 	 * busmaster bit will be zero.  If this is the case, then we
3613 	 * must restore PCI config space before the NIC can be used
3614 	 * again
3615 	 */
3616 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3617 	if (cmd == 0xffff) {
3618 		/*
3619 		 * Maybe the watchdog caught the NIC rebooting; wait
3620 		 * up to 100ms for it to finish.  If it does not come
3621 		 * back, then give up
3622 		 */
3623 		DELAY(1000*100);
3624 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3625 		if (cmd == 0xffff)
3626 			if_printf(sc->ifp, "NIC disappeared!\n");
3627 	}
3628 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3629 		/* Print the reboot status */
3630 		reboot = mxge_read_reboot(sc);
3631 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3632 
3633 		running = sc->ifp->if_flags & IFF_RUNNING;
3634 		if (running) {
3635 			/*
3636 			 * Quiesce NIC so that TX routines will not try to
3637 			 * xmit after restoration of BAR
3638 			 */
3639 
3640 			/* Mark the link as down */
3641 			if (sc->link_state) {
3642 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3643 				if_link_state_change(sc->ifp);
3644 			}
3645 			mxge_close(sc, 1);
3646 		}
3647 		/* Restore PCI configuration space */
3648 		dinfo = device_get_ivars(sc->dev);
3649 		pci_cfg_restore(sc->dev, dinfo);
3650 
3651 		/* And redo any changes we made to our config space */
3652 		mxge_setup_cfg_space(sc);
3653 
3654 		/* Reload f/w */
3655 		err = mxge_load_firmware(sc, 0);
3656 		if (err)
3657 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3658 		if (running && !err) {
3659 			int i;
3660 
3661 			err = mxge_open(sc);
3662 
3663 			for (i = 0; i < sc->num_tx_rings; ++i)
3664 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3665 		}
3666 		sc->watchdog_resets++;
3667 	} else {
3668 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3669 		err = 0;
3670 	}
3671 	if (err) {
3672 		if_printf(sc->ifp, "watchdog reset failed\n");
3673 	} else {
3674 		if (sc->dying == 2)
3675 			sc->dying = 0;
3676 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3677 	}
3678 }
3679 
3680 static void
3681 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3682 {
3683 	if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3684 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3685 	    tx->req, tx->done, tx->queue_active);
3686 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3687 	    tx->activate, tx->deactivate);
3688 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3689 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3690 }
3691 
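/*
 * Return the number of packets the interface moved since the last
 * call; mxge_tick() treats a zero return as an idle NIC.
 */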
3692 static u_long
3693 mxge_update_stats(mxge_softc_t *sc)
3694 {
3695 	u_long ipackets, opackets, pkts;
3696 
3697 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3698 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3699 
3700 	pkts = ipackets - sc->ipackets;
3701 	pkts += opackets - sc->opackets;
3702 
3703 	sc->ipackets = ipackets;
3704 	sc->opackets = opackets;
3705 
3706 	return pkts;
3707 }
3708 
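/*
 * Periodic housekeeping: aggregate slice statistics, re-probe the
 * media when asked to, check for a h/w fault while the NIC is idle
 * and re-arm the callout, at a quarter of the usual rate when idle.
 */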
3709 static void
3710 mxge_tick(void *arg)
3711 {
3712 	mxge_softc_t *sc = arg;
3713 	u_long pkts = 0;
3714 	int err = 0;
3715 	int ticks;
3716 
3717 	lwkt_serialize_enter(&sc->main_serialize);
3718 
3719 	ticks = mxge_ticks;
3720 	if (sc->ifp->if_flags & IFF_RUNNING) {
3721 		/* Aggregate stats from different slices */
3722 		pkts = mxge_update_stats(sc);
3723 		if (sc->need_media_probe)
3724 			mxge_media_probe(sc);
3725 	}
3726 	if (pkts == 0) {
3727 		uint16_t cmd;
3728 
3729 		/* Ensure NIC did not suffer h/w fault while idle */
3730 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3731 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3732 			sc->dying = 2;
3733 			mxge_serialize_skipmain(sc);
3734 			mxge_watchdog_reset(sc);
3735 			mxge_deserialize_skipmain(sc);
3736 			err = ENXIO;
3737 		}
3738 
3739 		/* Look less often if NIC is idle */
3740 		ticks *= 4;
3741 	}
3742 
3743 	if (err == 0)
3744 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3745 
3746 	lwkt_serialize_exit(&sc->main_serialize);
3747 }
3748 
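/* Manual media changes are not supported on this NIC */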
3749 static int
3750 mxge_media_change(struct ifnet *ifp)
3751 {
3752 	return EINVAL;
3753 }
3754 
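/*
 * Validate and apply a new MTU.  real_mtu is the worst-case frame
 * length on the wire (MTU plus Ethernet header plus a VLAN tag);
 * 60 is the minimum Ethernet frame length, excluding the FCS.
 */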
3755 static int
3756 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3757 {
3758 	struct ifnet *ifp = sc->ifp;
3759 	int real_mtu, old_mtu;
3760 	int err = 0;
3761 
3762 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3763 	if (mtu > sc->max_mtu || real_mtu < 60)
3764 		return EINVAL;
3765 
3766 	old_mtu = ifp->if_mtu;
3767 	ifp->if_mtu = mtu;
3768 	if (ifp->if_flags & IFF_RUNNING) {
3769 		mxge_close(sc, 0);
3770 		err = mxge_open(sc);
3771 		if (err != 0) {
3772 			ifp->if_mtu = old_mtu;
3773 			mxge_close(sc, 0);
3774 			mxge_open(sc);
3775 		}
3776 	}
3777 	return err;
3778 }
3779 
3780 static void
3781 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3782 {
3783 	mxge_softc_t *sc = ifp->if_softc;
3784 
3786 	if (sc == NULL)
3787 		return;
3788 	ifmr->ifm_status = IFM_AVALID;
3789 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3790 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3791 	ifmr->ifm_active |= sc->current_media;
3792 }
3793 
3794 static int
3795 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3796     struct ucred *cr __unused)
3797 {
3798 	mxge_softc_t *sc = ifp->if_softc;
3799 	struct ifreq *ifr = (struct ifreq *)data;
3800 	int err, mask;
3801 
3802 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3803 	err = 0;
3804 
3805 	switch (command) {
3806 	case SIOCSIFMTU:
3807 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3808 		break;
3809 
3810 	case SIOCSIFFLAGS:
3811 		if (sc->dying)
3812 			return EINVAL;
3813 
3814 		if (ifp->if_flags & IFF_UP) {
3815 			if (!(ifp->if_flags & IFF_RUNNING)) {
3816 				err = mxge_open(sc);
3817 			} else {
3818 				/*
3819 				 * Take care of PROMISC and ALLMULTI
3820 				 * flag changes
3821 				 */
3822 				mxge_change_promisc(sc,
3823 				    ifp->if_flags & IFF_PROMISC);
3824 				mxge_set_multicast_list(sc);
3825 			}
3826 		} else {
3827 			if (ifp->if_flags & IFF_RUNNING)
3828 				mxge_close(sc, 0);
3829 		}
3830 		break;
3831 
3832 	case SIOCADDMULTI:
3833 	case SIOCDELMULTI:
3834 		mxge_set_multicast_list(sc);
3835 		break;
3836 
3837 	case SIOCSIFCAP:
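		/* mask holds the capability bits being toggled */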
3838 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3839 		if (mask & IFCAP_TXCSUM) {
3840 			ifp->if_capenable ^= IFCAP_TXCSUM;
3841 			if (ifp->if_capenable & IFCAP_TXCSUM)
3842 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3843 			else
3844 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3845 		}
3846 		if (mask & IFCAP_TSO) {
3847 			ifp->if_capenable ^= IFCAP_TSO;
3848 			if (ifp->if_capenable & IFCAP_TSO)
3849 				ifp->if_hwassist |= CSUM_TSO;
3850 			else
3851 				ifp->if_hwassist &= ~CSUM_TSO;
3852 		}
3853 		if (mask & IFCAP_RXCSUM)
3854 			ifp->if_capenable ^= IFCAP_RXCSUM;
3855 		if (mask & IFCAP_VLAN_HWTAGGING)
3856 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3857 		break;
3858 
3859 	case SIOCGIFMEDIA:
3860 		mxge_media_probe(sc);
3861 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3862 		    &sc->media, command);
3863 		break;
3864 
3865 	default:
3866 		err = ether_ioctl(ifp, command, data);
3867 		break;
3868 	}
3869 	return err;
3870 }
3871 
3872 static void
3873 mxge_fetch_tunables(mxge_softc_t *sc)
3874 {
3875 	sc->intr_coal_delay = mxge_intr_coal_delay;
3876 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3877 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3878 
3879 	/* XXX */
3880 	if (mxge_ticks == 0)
3881 		mxge_ticks = hz / 2;
3882 
3883 	sc->pause = mxge_flow_control;
3884 	sc->use_rss = mxge_use_rss;
3885 
3886 	sc->throttle = mxge_throttle;
3887 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3888 		sc->throttle = MXGE_MAX_THROTTLE;
3889 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3890 		sc->throttle = MXGE_MIN_THROTTLE;
3891 }
3892 
3893 static void
3894 mxge_free_slices(mxge_softc_t *sc)
3895 {
3896 	struct mxge_slice_state *ss;
3897 	int i;
3898 
3899 	if (sc->ss == NULL)
3900 		return;
3901 
3902 	for (i = 0; i < sc->num_slices; i++) {
3903 		ss = &sc->ss[i];
3904 		if (ss->fw_stats != NULL) {
3905 			mxge_dma_free(&ss->fw_stats_dma);
3906 			ss->fw_stats = NULL;
3907 		}
3908 		if (ss->rx_data.rx_done.entry != NULL) {
3909 			mxge_dma_free(&ss->rx_done_dma);
3910 			ss->rx_data.rx_done.entry = NULL;
3911 		}
3912 	}
3913 	kfree(sc->ss, M_DEVBUF);
3914 	sc->ss = NULL;
3915 }
3916 
3917 static int
3918 mxge_alloc_slices(mxge_softc_t *sc)
3919 {
3920 	mxge_cmd_t cmd;
3921 	struct mxge_slice_state *ss;
3922 	size_t bytes;
3923 	int err, i, rx_ring_size;
3924 
3925 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3926 	if (err != 0) {
3927 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3928 		return err;
3929 	}
3930 	rx_ring_size = cmd.data0;
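	/*
	 * The ring size is in bytes; each receive descriptor can
	 * produce one event, and every slice has two receive rings
	 * (small and big buffers), hence the factor of two.
	 */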
3931 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof (mcp_dma_addr_t));
3932 
3933 	bytes = sizeof(*sc->ss) * sc->num_slices;
3934 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3935 
3936 	for (i = 0; i < sc->num_slices; i++) {
3937 		ss = &sc->ss[i];
3938 
3939 		ss->sc = sc;
3940 
3941 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3942 		lwkt_serialize_init(&ss->tx.tx_serialize);
3943 		ss->intr_rid = -1;
3944 
3945 		/*
3946 		 * Allocate per-slice rx interrupt queue
3947 		 * XXX assumes a 4-byte mcp_slot
3948 		 */
3949 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3950 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3951 		if (err != 0) {
3952 			device_printf(sc->dev,
3953 			    "alloc %d slice rx_done failed\n", i);
3954 			return err;
3955 		}
3956 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3957 
3958 		/*
3959 		 * Allocate the per-slice firmware stats
3960 		 */
3961 		bytes = sizeof(*ss->fw_stats);
3962 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3963 		    bytes, 64);
3964 		if (err != 0) {
3965 			device_printf(sc->dev,
3966 			    "alloc %d fw_stats failed\n", i);
3967 			return err;
3968 		}
3969 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3970 	}
3971 	return 0;
3972 }
3973 
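/*
 * Decide how many slices (and TX rings) to use.  Multiple slices
 * require more than one CPU, MSI-X with at least two vectors and
 * the RSS-capable firmware; any failure along the way falls back
 * to a single slice.
 */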
3974 static void
3975 mxge_slice_probe(mxge_softc_t *sc)
3976 {
3977 	int status, max_intr_slots, max_slices, num_slices;
3978 	int msix_cnt, msix_enable, i, multi_tx;
3979 	mxge_cmd_t cmd;
3980 	const char *old_fw;
3981 
3982 	sc->num_slices = 1;
3983 	sc->num_tx_rings = 1;
3984 
3985 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
3986 	if (num_slices == 1)
3987 		return;
3988 
3989 	if (ncpus2 == 1)
3990 		return;
3991 
3992 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
3993 	    mxge_msix_enable);
3994 	if (!msix_enable)
3995 		return;
3996 
3997 	msix_cnt = pci_msix_count(sc->dev);
3998 	if (msix_cnt < 2)
3999 		return;
4000 
4001 	/*
4002 	 * Round down MSI-X vector count to the nearest power of 2
4003 	 */
4004 	i = 0;
4005 	while ((1 << (i + 1)) <= msix_cnt)
4006 		++i;
4007 	msix_cnt = 1 << i;
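	/* e.g. an msix_cnt of 5, 6 or 7 rounds down to 4 */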
4008 
4009 	/*
4010 	 * Now load the slice-aware firmware and see what it supports
4011 	 */
4012 	old_fw = sc->fw_name;
4013 	if (old_fw == mxge_fw_aligned)
4014 		sc->fw_name = mxge_fw_rss_aligned;
4015 	else
4016 		sc->fw_name = mxge_fw_rss_unaligned;
4017 	status = mxge_load_firmware(sc, 0);
4018 	if (status != 0) {
4019 		device_printf(sc->dev, "Falling back to a single slice\n");
4020 		return;
4021 	}
4022 
4023 	/*
4024 	 * Try to send a reset command to the card to see if it is alive
4025 	 */
4026 	memset(&cmd, 0, sizeof(cmd));
4027 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4028 	if (status != 0) {
4029 		device_printf(sc->dev, "failed reset\n");
4030 		goto abort_with_fw;
4031 	}
4032 
4033 	/*
4034 	 * Get rx ring size to calculate rx interrupt queue size
4035 	 */
4036 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4037 	if (status != 0) {
4038 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4039 		goto abort_with_fw;
4040 	}
4041 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4042 
4043 	/*
4044 	 * Tell it the size of the rx interrupt queue
4045 	 */
4046 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4047 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4048 	if (status != 0) {
4049 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4050 		goto abort_with_fw;
4051 	}
4052 
4053 	/*
4054 	 * Ask the maximum number of slices it supports
4055 	 */
4056 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4057 	if (status != 0) {
4058 		device_printf(sc->dev,
4059 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4060 		goto abort_with_fw;
4061 	}
4062 	max_slices = cmd.data0;
4063 
4064 	/*
4065 	 * Round down max slices count to the nearest power of 2
4066 	 */
4067 	i = 0;
4068 	while ((1 << (i + 1)) <= max_slices)
4069 		++i;
4070 	max_slices = 1 << i;
4071 
4072 	if (max_slices > msix_cnt)
4073 		max_slices = msix_cnt;
4074 
4075 	sc->num_slices = num_slices;
4076 	sc->num_slices = if_ring_count2(sc->num_slices, max_slices);
4077 
4078 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4079 	if (multi_tx)
4080 		sc->num_tx_rings = sc->num_slices;
4081 
4082 	if (bootverbose) {
4083 		device_printf(sc->dev, "using %d slices, max %d\n",
4084 		    sc->num_slices, max_slices);
4085 	}
4086 
4087 	if (sc->num_slices == 1)
4088 		goto abort_with_fw;
4089 	return;
4090 
4091 abort_with_fw:
4092 	sc->fw_name = old_fw;
4093 	mxge_load_firmware(sc, 0);
4094 }
4095 
4096 static void
4097 mxge_setup_serialize(struct mxge_softc *sc)
4098 {
4099 	int i = 0, slice;
4100 
4101 	/* Main + rx + tx */
4102 	sc->nserialize = (2 * sc->num_slices) + 1;
4103 	sc->serializes =
4104 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4105 	        M_DEVBUF, M_WAITOK | M_ZERO);
4106 
4107 	/*
4108 	 * Setup serializes
4109 	 *
4110 	 * NOTE: Order is critical: main first, then per-slice rx, then per-slice tx
4111 	 */
4112 
4113 	KKASSERT(i < sc->nserialize);
4114 	sc->serializes[i++] = &sc->main_serialize;
4115 
4116 	for (slice = 0; slice < sc->num_slices; ++slice) {
4117 		KKASSERT(i < sc->nserialize);
4118 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4119 	}
4120 
4121 	for (slice = 0; slice < sc->num_slices; ++slice) {
4122 		KKASSERT(i < sc->nserialize);
4123 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4124 	}
4125 
4126 	KKASSERT(i == sc->nserialize);
4127 }
4128 
4129 static void
4130 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4131 {
4132 	struct mxge_softc *sc = ifp->if_softc;
4133 
4134 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4135 }
4136 
4137 static void
4138 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4139 {
4140 	struct mxge_softc *sc = ifp->if_softc;
4141 
4142 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4143 }
4144 
4145 static int
4146 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4147 {
4148 	struct mxge_softc *sc = ifp->if_softc;
4149 
4150 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4151 }
4152 
4153 #ifdef INVARIANTS
4154 
4155 static void
4156 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4157     boolean_t serialized)
4158 {
4159 	struct mxge_softc *sc = ifp->if_softc;
4160 
4161 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4162 	    slz, serialized);
4163 }
4164 
4165 #endif	/* INVARIANTS */
4166 
4167 #ifdef IFPOLL_ENABLE
4168 
4169 static void
4170 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4171 {
4172 	struct mxge_slice_state *ss = xss;
4173 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4174 
4175 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4176 
4177 	if (rx_done->entry[rx_done->idx].length != 0) {
4178 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4179 	} else {
4180 		/*
4181 		 * XXX
4182 		 * This register write obviously has a cost;
4183 		 * however, if we don't hand back the rx token,
4184 		 * upcoming packets may suffer a ridiculously
4185 		 * large delay, as observed on 8AL-C using ping(8).
4186 		 */
4187 		*ss->irq_claim = be32toh(3);
4188 	}
4189 }
4190 
4191 static void
4192 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4193 {
4194 	struct mxge_softc *sc = ifp->if_softc;
4195 	int i;
4196 
4197 	if (info == NULL)
4198 		return;
4199 
4200 	/*
4201 	 * Only poll rx; polling tx and status don't seem to work
4202 	 */
4203 	for (i = 0; i < sc->num_slices; ++i) {
4204 		struct mxge_slice_state *ss = &sc->ss[i];
4205 		int idx = ss->intr_cpuid;
4206 
4207 		KKASSERT(idx < ncpus2);
4208 		info->ifpi_rx[idx].poll_func = mxge_npoll_rx;
4209 		info->ifpi_rx[idx].arg = ss;
4210 		info->ifpi_rx[idx].serializer = &ss->rx_data.rx_serialize;
4211 	}
4212 }
4213 
4214 #endif	/* IFPOLL_ENABLE */
4215 
4216 static int
4217 mxge_attach(device_t dev)
4218 {
4219 	mxge_softc_t *sc = device_get_softc(dev);
4220 	struct ifnet *ifp = &sc->arpcom.ac_if;
4221 	int err, rid, i;
4222 
4223 	/*
4224 	 * Avoid rewriting half the lines in this file to use
4225 	 * &sc->arpcom.ac_if instead
4226 	 */
4227 	sc->ifp = ifp;
4228 	sc->dev = dev;
4229 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4230 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4231 
4232 	lwkt_serialize_init(&sc->main_serialize);
4233 
4234 	mxge_fetch_tunables(sc);
4235 
4236 	err = bus_dma_tag_create(NULL,			/* parent */
4237 				 1,			/* alignment */
4238 				 0,			/* boundary */
4239 				 BUS_SPACE_MAXADDR,	/* low */
4240 				 BUS_SPACE_MAXADDR,	/* high */
4241 				 NULL, NULL,		/* filter */
4242 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4243 				 0, 			/* num segs */
4244 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4245 				 0,			/* flags */
4246 				 &sc->parent_dmat);	/* tag */
4247 	if (err != 0) {
4248 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4249 		goto failed;
4250 	}
4251 
4252 	callout_init_mp(&sc->co_hdl);
4253 
4254 	mxge_setup_cfg_space(sc);
4255 
4256 	/*
4257 	 * Map the board into the kernel
4258 	 */
4259 	rid = PCIR_BARS;
4260 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4261 	    &rid, RF_ACTIVE);
4262 	if (sc->mem_res == NULL) {
4263 		device_printf(dev, "could not map memory\n");
4264 		err = ENXIO;
4265 		goto failed;
4266 	}
4267 
4268 	sc->sram = rman_get_virtual(sc->mem_res);
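	/*
	 * Usable SRAM: the board has 2MB, less the regions claimed
	 * by the firmware (apparently two 48KB blocks plus one 32KB
	 * block) and a 0x100 byte pad.
	 */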
4269 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4270 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4271 		device_printf(dev, "impossible memory region size %ld\n",
4272 		    rman_get_size(sc->mem_res));
4273 		err = ENXIO;
4274 		goto failed;
4275 	}
4276 
4277 	/*
4278 	 * Make NULL terminated copy of the EEPROM strings section of
4279 	 * lanai SRAM
4280 	 */
4281 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4282 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4283 	    rman_get_bushandle(sc->mem_res),
4284 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4285 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4286 	err = mxge_parse_strings(sc);
4287 	if (err != 0) {
4288 		device_printf(dev, "parse EEPROM string failed\n");
4289 		goto failed;
4290 	}
4291 
4292 	/*
4293 	 * Enable write combining for efficient use of PCIe bus
4294 	 */
4295 	mxge_enable_wc(sc);
4296 
4297 	/*
4298 	 * Allocate the out of band DMA memory
4299 	 */
4300 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4301 	if (err != 0) {
4302 		device_printf(dev, "alloc cmd DMA buf failed\n");
4303 		goto failed;
4304 	}
4305 	sc->cmd = sc->cmd_dma.dmem_addr;
4306 
4307 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4308 	if (err != 0) {
4309 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4310 		goto failed;
4311 	}
4312 
4313 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4314 	if (err != 0) {
4315 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4316 		goto failed;
4317 	}
4318 
4319 	/* Select & load the firmware */
4320 	err = mxge_select_firmware(sc);
4321 	if (err != 0) {
4322 		device_printf(dev, "select firmware failed\n");
4323 		goto failed;
4324 	}
4325 
4326 	mxge_slice_probe(sc);
4327 	err = mxge_alloc_slices(sc);
4328 	if (err != 0) {
4329 		device_printf(dev, "alloc slices failed\n");
4330 		goto failed;
4331 	}
4332 
4333 	err = mxge_alloc_intr(sc);
4334 	if (err != 0) {
4335 		device_printf(dev, "alloc intr failed\n");
4336 		goto failed;
4337 	}
4338 
4339 	/* Setup serializes */
4340 	mxge_setup_serialize(sc);
4341 
4342 	err = mxge_reset(sc, 0);
4343 	if (err != 0) {
4344 		device_printf(dev, "reset failed\n");
4345 		goto failed;
4346 	}
4347 
4348 	err = mxge_alloc_rings(sc);
4349 	if (err != 0) {
4350 		device_printf(dev, "failed to allocate rings\n");
4351 		goto failed;
4352 	}
4353 
4354 	ifp->if_baudrate = IF_Gbps(10UL);
4355 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4356 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4357 
4358 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4359 #if 0
4360 	/* Well, it's software, sigh */
4361 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4362 #endif
4363 	ifp->if_capenable = ifp->if_capabilities;
4364 
4365 	ifp->if_softc = sc;
4366 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4367 	ifp->if_init = mxge_init;
4368 	ifp->if_ioctl = mxge_ioctl;
4369 	ifp->if_start = mxge_start;
4370 #ifdef IFPOLL_ENABLE
4371 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4372 		ifp->if_npoll = mxge_npoll;
4373 #endif
4374 	ifp->if_serialize = mxge_serialize;
4375 	ifp->if_deserialize = mxge_deserialize;
4376 	ifp->if_tryserialize = mxge_tryserialize;
4377 #ifdef INVARIANTS
4378 	ifp->if_serialize_assert = mxge_serialize_assert;
4379 #endif
4380 
4381 	/* Increase TSO burst length */
4382 	ifp->if_tsolen = (32 * ETHERMTU);
4383 
4384 	/* Initialise the ifmedia structure */
4385 	mxge_media_init(sc);
4386 	mxge_media_probe(sc);
4387 
4388 	ether_ifattach(ifp, sc->mac_addr, NULL);
4389 
4390 	/* Setup TX rings and subqueues */
4391 	for (i = 0; i < sc->num_tx_rings; ++i) {
4392 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4393 		struct mxge_slice_state *ss = &sc->ss[i];
4394 
4395 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4396 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4397 		ifsq_set_priv(ifsq, &ss->tx);
4398 		ss->tx.ifsq = ifsq;
4399 
4400 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
4401 	}
4402 
4403 	/*
4404 	 * XXX
4405 	 * We are not ready to do "gather" jumbo frame, so
4406 	 * We are not ready to do "gather" jumbo frames, so
4407 	 */
4408 	sc->max_mtu = MJUMPAGESIZE -
4409 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4410 	sc->dying = 0;
4411 
4412 	err = mxge_setup_intr(sc);
4413 	if (err != 0) {
4414 		device_printf(dev, "alloc and setup intr failed\n");
4415 		ether_ifdetach(ifp);
4416 		goto failed;
4417 	}
4418 
4419 	mxge_add_sysctls(sc);
4420 
4421 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4422 	    sc->ss[0].intr_cpuid);
4423 	return 0;
4424 
4425 failed:
4426 	mxge_detach(dev);
4427 	return err;
4428 }
4429 
4430 static int
4431 mxge_detach(device_t dev)
4432 {
4433 	mxge_softc_t *sc = device_get_softc(dev);
4434 
4435 	if (device_is_attached(dev)) {
4436 		struct ifnet *ifp = sc->ifp;
4437 
4438 		ifnet_serialize_all(ifp);
4439 
4440 		sc->dying = 1;
4441 		if (ifp->if_flags & IFF_RUNNING)
4442 			mxge_close(sc, 1);
4443 		callout_stop(&sc->co_hdl);
4444 
4445 		mxge_teardown_intr(sc, sc->num_slices);
4446 
4447 		ifnet_deserialize_all(ifp);
4448 
4449 		callout_terminate(&sc->co_hdl);
4450 
4451 		ether_ifdetach(ifp);
4452 	}
4453 	ifmedia_removeall(&sc->media);
4454 
4455 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4456 	    sc->sram != NULL)
4457 		mxge_dummy_rdma(sc, 0);
4458 
4459 	mxge_free_intr(sc);
4460 	mxge_rem_sysctls(sc);
4461 	mxge_free_rings(sc);
4462 
4463 	/* MUST be done after the sysctls, intr and rings are freed */
4464 	mxge_free_slices(sc);
4465 
4466 	if (sc->dmabench_dma.dmem_addr != NULL)
4467 		mxge_dma_free(&sc->dmabench_dma);
4468 	if (sc->zeropad_dma.dmem_addr != NULL)
4469 		mxge_dma_free(&sc->zeropad_dma);
4470 	if (sc->cmd_dma.dmem_addr != NULL)
4471 		mxge_dma_free(&sc->cmd_dma);
4472 
4473 	if (sc->msix_table_res != NULL) {
4474 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4475 		    sc->msix_table_res);
4476 	}
4477 	if (sc->mem_res != NULL) {
4478 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4479 		    sc->mem_res);
4480 	}
4481 
4482 	if (sc->parent_dmat != NULL)
4483 		bus_dma_tag_destroy(sc->parent_dmat);
4484 
4485 	return 0;
4486 }
4487 
4488 static int
4489 mxge_shutdown(device_t dev)
4490 {
4491 	return 0;
4492 }
4493 
4494 static void
4495 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4496 {
4497 	int i;
4498 
4499 	KKASSERT(sc->num_slices > 1);
4500 
4501 	for (i = 0; i < sc->num_slices; ++i) {
4502 		struct mxge_slice_state *ss = &sc->ss[i];
4503 
4504 		if (ss->intr_res != NULL) {
4505 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4506 			    ss->intr_rid, ss->intr_res);
4507 		}
4508 		if (ss->intr_rid >= 0)
4509 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4510 	}
4511 	if (setup)
4512 		pci_teardown_msix(sc->dev);
4513 }
4514 
4515 static int
4516 mxge_alloc_msix(struct mxge_softc *sc)
4517 {
4518 	struct mxge_slice_state *ss;
4519 	int offset, rid, error, i;
4520 	boolean_t setup = FALSE;
4521 
4522 	KKASSERT(sc->num_slices > 1);
4523 
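	/*
	 * Pick the target CPU of the first slice.  Slices of multiple
	 * devices are spread across CPUs via a per-unit default offset,
	 * which the msix.offset tunable may override.
	 */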
4524 	if (sc->num_slices == ncpus2) {
4525 		offset = 0;
4526 	} else {
4527 		int offset_def;
4528 
4529 		offset_def = (sc->num_slices * device_get_unit(sc->dev)) %
4530 		    ncpus2;
4531 
4532 		offset = device_getenv_int(sc->dev, "msix.offset", offset_def);
4533 		if (offset >= ncpus2 ||
4534 		    offset % sc->num_slices != 0) {
4535 			device_printf(sc->dev, "invalid msix.offset %d, "
4536 			    "use %d\n", offset, offset_def);
4537 			offset = offset_def;
4538 		}
4539 	}
4540 
4541 	ss = &sc->ss[0];
4542 
4543 	ss->intr_serialize = &sc->main_serialize;
4544 	ss->intr_func = mxge_msi;
4545 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4546 	    "%s comb", device_get_nameunit(sc->dev));
4547 	ss->intr_desc = ss->intr_desc0;
4548 	ss->intr_cpuid = offset;
4549 
4550 	for (i = 1; i < sc->num_slices; ++i) {
4551 		ss = &sc->ss[i];
4552 
4553 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4554 		if (sc->num_tx_rings == 1) {
4555 			ss->intr_func = mxge_msix_rx;
4556 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4557 			    "%s rx", device_get_nameunit(sc->dev));
4558 		} else {
4559 			ss->intr_func = mxge_msix_rxtx;
4560 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4561 			    "%s rxtx", device_get_nameunit(sc->dev));
4562 		}
4563 		ss->intr_desc = ss->intr_desc0;
4564 		ss->intr_cpuid = offset + i;
4565 	}
4566 
4567 	rid = PCIR_BAR(2);
4568 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4569 	    &rid, RF_ACTIVE);
4570 	if (sc->msix_table_res == NULL) {
4571 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4572 		return ENXIO;
4573 	}
4574 
4575 	error = pci_setup_msix(sc->dev);
4576 	if (error) {
4577 		device_printf(sc->dev, "could not setup MSI-X\n");
4578 		goto back;
4579 	}
4580 	setup = TRUE;
4581 
4582 	for (i = 0; i < sc->num_slices; ++i) {
4583 		ss = &sc->ss[i];
4584 
4585 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4586 		    ss->intr_cpuid);
4587 		if (error) {
4588 			device_printf(sc->dev, "could not alloc "
4589 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4590 			goto back;
4591 		}
4592 
4593 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4594 		    &ss->intr_rid, RF_ACTIVE);
4595 		if (ss->intr_res == NULL) {
4596 			device_printf(sc->dev, "could not alloc "
4597 			    "MSI-X %d resource\n", i);
4598 			error = ENXIO;
4599 			goto back;
4600 		}
4601 	}
4602 
4603 	pci_enable_msix(sc->dev);
4604 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4605 back:
4606 	if (error)
4607 		mxge_free_msix(sc, setup);
4608 	return error;
4609 }
4610 
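/*
 * Allocate interrupt resources: MSI-X when multiple slices are in
 * use, otherwise a single MSI or legacy interrupt.
 */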
4611 static int
4612 mxge_alloc_intr(struct mxge_softc *sc)
4613 {
4614 	struct mxge_slice_state *ss;
4615 	u_int irq_flags;
4616 
4617 	if (sc->num_slices > 1) {
4618 		int error;
4619 
4620 		error = mxge_alloc_msix(sc);
4621 		if (error)
4622 			return error;
4623 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4624 		return 0;
4625 	}
4626 
4627 	ss = &sc->ss[0];
4628 
4629 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4630 	    &ss->intr_rid, &irq_flags);
4631 
4632 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4633 	    &ss->intr_rid, irq_flags);
4634 	if (ss->intr_res == NULL) {
4635 		device_printf(sc->dev, "could not alloc interrupt\n");
4636 		return ENXIO;
4637 	}
4638 
4639 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4640 		ss->intr_func = mxge_legacy;
4641 	else
4642 		ss->intr_func = mxge_msi;
4643 	ss->intr_serialize = &sc->main_serialize;
4644 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4645 
4646 	return 0;
4647 }
4648 
4649 static int
4650 mxge_setup_intr(struct mxge_softc *sc)
4651 {
4652 	int i;
4653 
4654 	for (i = 0; i < sc->num_slices; ++i) {
4655 		struct mxge_slice_state *ss = &sc->ss[i];
4656 		int error;
4657 
4658 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4659 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4660 		    ss->intr_serialize, ss->intr_desc);
4661 		if (error) {
4662 			device_printf(sc->dev, "can't setup intr %d\n", i);
4663 			mxge_teardown_intr(sc, i);
4664 			return error;
4665 		}
4666 	}
4667 	return 0;
4668 }
4669 
4670 static void
4671 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4672 {
4673 	int i;
4674 
4675 	if (sc->ss == NULL)
4676 		return;
4677 
4678 	for (i = 0; i < cnt; ++i) {
4679 		struct mxge_slice_state *ss = &sc->ss[i];
4680 
4681 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4682 	}
4683 }
4684 
4685 static void
4686 mxge_free_intr(struct mxge_softc *sc)
4687 {
4688 	if (sc->ss == NULL)
4689 		return;
4690 
4691 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4692 		struct mxge_slice_state *ss = &sc->ss[0];
4693 
4694 		if (ss->intr_res != NULL) {
4695 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4696 			    ss->intr_rid, ss->intr_res);
4697 		}
4698 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4699 			pci_release_msi(sc->dev);
4700 	} else {
4701 		mxge_free_msix(sc, TRUE);
4702 	}
4703 }
4704