xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 7608722c)
/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $

***************************************************************************/

#include "opt_ifpoll.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_poll.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>
#include <net/toeplitz.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__x86_64__)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
#include <dev/netif/mxge/if_mxge_var.h>

#define MXGE_IFM	(IFM_ETHER | IFM_FDX | IFM_ETH_FORCEPAUSE)

#define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
#define MXGE_HWRSS_KEYLEN		16

/* Tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
static int mxge_deassert_wait = 1;
static int mxge_ticks;
static int mxge_num_slices = 0;
static int mxge_always_promisc = 0;
static int mxge_throttle = 0;
static int mxge_msi_enable = 1;
static int mxge_msix_enable = 1;
static int mxge_multi_tx = 1;
/*
 * Don't use RSS by default, it's just too slow
 */
static int mxge_use_rss = 0;

static char mxge_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_FORCE_FULL;

static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
static const char *mxge_fw_aligned = "mxge_eth_z8e";
static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
TUNABLE_STR("hw.mxge.flow_ctrl", mxge_flowctrl, sizeof(mxge_flowctrl));
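
/*
 * Example (illustrative values, not recommendations) of overriding the
 * tunables above from /boot/loader.conf:
 *
 *   hw.mxge.intr_coal_delay="30"	# interrupt coalescing delay, in usecs
 *   hw.mxge.num_slices="0"		# 0 selects the driver default
 *   hw.mxge.msix.enable="1"		# allow MSI-X
 */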

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);

static int mxge_alloc_intr(struct mxge_softc *sc);
static void mxge_free_intr(struct mxge_softc *sc);
static int mxge_setup_intr(struct mxge_softc *sc);
static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);

static device_method_t mxge_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	DEVMETHOD_END
};

static driver_t mxge_driver = {
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static void mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
static void mxge_watchdog_reset(mxge_softc_t *sc);
static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);

static int
mxge_probe(device_t dev)
{
	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
		int rev = pci_get_revid(dev);

		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__x86_64__)
	vm_offset_t len;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
	    PAT_WRITE_COMBINING);
#endif
}

static int
mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
    bus_size_t alignment)
{
	bus_size_t boundary;
	int err;

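	/*
	 * Allocations of up to a page must not cross a 4KB boundary.
	 * A larger allocation that is itself 4KB-aligned cannot honor
	 * that restriction, so the boundary is waived for it.
	 */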
	if (bytes > 4096 && alignment == 4096)
		boundary = 0;
	else
		boundary = 4096;

	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
		return err;
	}
	return 0;
}

static void
mxge_dma_free(bus_dmamem_t *dma)
{
	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
	bus_dma_tag_destroy(dma->dmem_tag);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	const char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");
	return ENXIO;
}

#if defined(__x86_64__)

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/*
	 * XXXX
	 * Test below is commented because it is believed that doing
	 * config read/write beyond 0xff will access the config space
	 * for the next larger function.  Uncomment this and remove
	 * the hacky pmap_mapdev() way of accessing config space when
	 * DragonFly grows support for extended pcie config space access.
	 */
#if 0
	/*
	 * See if we can, by some miracle, access the extended
	 * config space
	 */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/*
	 * Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off = base + 0x00100000UL * (unsigned long)bus +
	    0x00001000UL * (unsigned long)(func + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (!(vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
		    vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t *)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (bootverbose) {
		device_printf(sc->dev, "Enabled ECRC on upstream "
		    "Nvidia bridge at %d:%d:%d\n",
		    (int)bus, (int)slot, (int)func);
	}
}

#else	/* __x86_64__ */

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
}

#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
	int status;
	uint32_t len;
	const char *test = " ";

	/*
	 * Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
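	/*
	 * Worked example of the arithmetic below: the upper half of
	 * cmd.data0 is a transfer count and the lower half elapsed
	 * 0.5us ticks, so throughput is (transfers * len) bytes in
	 * (ticks / 2) us, and 1 byte/us equals 1 MB/s; hence the
	 * (... * len * 2) / ticks expressions.
	 */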

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0 >> 16) * len * 2 * 2) /
	    (cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
		    test, status);
	}
	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chipsets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;

	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
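		/*
		 * Offset 0x8 from the PCIe capability is the Device
		 * Control register; bits 14:12 hold the Max Read
		 * Request Size, where the encoding 5 means 4096 bytes.
		 */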
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
			    pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * Load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0)
		return status;

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS) {
		device_printf(dev, "Falling back to ethp! "
		    "Please install up-to-date firmware\n");
	}
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (bootverbose) {
			device_printf(sc->dev,
			    "Assuming %s completions (forced)\n",
			    aligned ? "aligned" : "unaligned");
		}
		goto abort;
	}

	/*
	 * If the PCIe link width is 4 or less, we can use the aligned
	 * firmware and skip any checks
	 */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev, "PCIe x%d Link, "
		    "expect reduced performance\n", sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (mxge_firmware_probe(sc) == 0)
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return mxge_load_firmware(sc, 0);
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
		    be32toh(hdr->mcp_type));
		return EIO;
	}

	/* Save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (bootverbose)
		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	    &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		if_printf(sc->ifp, "Found firmware version %s\n",
		    sc->fw_version);
		if_printf(sc->ifp, "Driver needs %d.%d\n",
		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	return kmalloc(items * size, M_TEMP, M_WAITOK);
}

static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		if_printf(sc->ifp, "Could not find firmware image %s\n",
		    sc->fw_name);
		return ENOENT;
	}

	/* Setup zlib and decompress f/w */
	bzero(&zs, sizeof(zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/*
	 * The uncompressed size is stored as the firmware version,
	 * which would otherwise go unused
	 */
	fw_len = (size_t)fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		if_printf(sc->ifp, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* Check id */
	hdr_offset =
	    htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		if_printf(sc->ifp, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void *)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
		    min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

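	/* 8-byte-align the scratch buffer used to build the command block */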
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send an rdma command to the PCIe engine, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
		    (enable ? "enable" : "disable"), confirm, *confirm);
	}
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* Ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

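	/*
	 * Preload the result with a sentinel; the firmware DMAs its real
	 * status into the response block, so any value other than
	 * 0xffffffff seen below means the command has completed.
	 */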
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof(*buf));

	/*
	 * Wait up to 20ms
	 */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			if_printf(sc->ifp, "command %d failed, result = %d\n",
			    cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN) {
		if_printf(sc->ifp, "command %d timed out, result = %d\n",
		    cmd, be32toh(response->result));
	}
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof(struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/*
	 * Find running firmware header
	 */
	hdr_offset =
	    htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		if_printf(sc->ifp, "Running firmware has bad header offset "
		    "(%zu)\n", hdr_offset);
		return EIO;
	}

	/*
	 * Copy header of running firmware from SRAM to host memory to
	 * validate firmware
	 */
	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * Check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
		    "working around rx filter bug\n",
		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;

		/*
		 * Try to use the currently running firmware, if
		 * it is new enough
		 */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			if_printf(sc->ifp,
			    "failed to adopt running firmware\n");
			return status;
		}
		if_printf(sc->ifp, "Successfully adopted running firmware\n");

		if (sc->tx_boundary == 4096) {
			if_printf(sc->ifp,
			    "Using firmware currently running on NIC.  "
			    "For optimal\n");
			if_printf(sc->ifp, "performance consider loading "
			    "optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send a reload command to the bootstrap MCP, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/*
	 * FIX: All newest firmware should un-protect the bottom of
	 * the sram before handoff. However, the very first interfaces
	 * do not. Therefore the handoff copy must skip the first 8 bytes
	 */
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000 * 10);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
		    confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;

	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
	    (addr[2] << 8) | addr[3];
	cmd.data1 = (addr[4] << 8) | (addr[5]);
	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
	if (status) {
		if_printf(sc->ifp, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* avoid gcc warning */
	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
	if (status)
		if_printf(sc->ifp, "Failed to set promisc mode\n");
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
		    "error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI) {
		/* Request to disable multicast filtering, so quit here */
		return;
	}

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
		    "error status: %d\n", err);
		return;
	}

	/*
	 * Walk the multicast list, and add each address
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;

		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		    &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		    &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
			    "error status: %d\n", err);
			/* Abort, leaving multicast filtering off */
			return;
		}
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
		    "error status: %d\n", err);
	}
}

#if 0
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/*
	 * Try to set nbufs to see if we can
	 * use virtually contiguous jumbos.
	 */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
	    &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* Otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
#endif

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status, rx_intr_size;

	/*
	 * Try to send a reset command to the card to see if it
	 * is alive
	 */
	memset(&cmd, 0, sizeof(cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		if_printf(sc->ifp, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/*
	 * Set the intrq size
	 * XXX assume 4-byte mcp_slot
	 */
	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
	cmd.data0 = rx_intr_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
	if (sc->num_slices > 1) {
		/* Ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to get number of slices\n");
			return status;
		}

		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		if (sc->num_tx_rings > 1)
			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];

			rx_done = &ss->rx_data.rx_done;
			memset(rx_done->entry, 0, rx_intr_size);

			cmd.data0 =
			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data1 =
			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
			    &cmd);
		}
	}

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
	    &cmd);
	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);

	if (status != 0) {
		if_printf(sc->ifp, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* Run a DMA benchmark */
	mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);

		/* Reset mcp/driver shared state back to 0 */
		ss->rx_data.rx_done.idx = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->rx_data.rx_big.cnt = 0;
		ss->rx_data.rx_small.cnt = 0;
		if (ss->fw_stats != NULL)
			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
	}
	sc->rdma_tags_available = 15;

	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);

	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
			if_printf(sc->ifp, "can't enable throttle\n");
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0)
		return err;

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	int err, use_rss;

	sc = arg1;
	use_rss = sc->use_rss;
	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
	if (err != 0)
		return err;

	if (use_rss == sc->use_rss)
		return 0;

	ifnet_serialize_all(sc->ifp);

	sc->use_rss = use_rss;
	if (sc->ifp->if_flags & IFF_RUNNING) {
		mxge_close(sc, 0);
		mxge_open(sc);
	}

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0)
		return err;

	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

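	/*
	 * Byte-swap the big-endian firmware counter and hand it to
	 * sysctl_handle_int() by value: arg1 is cleared and the
	 * swapped value is passed in arg2.
	 */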
	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	if (sc->ss != NULL) {
		struct mxge_slice_state *ss;
		int slice;

		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];
			if (ss->sysctl_tree != NULL) {
				sysctl_ctx_free(&ss->sysctl_ctx);
				ss->sysctl_tree = NULL;
			}
		}
	}

	if (sc->slice_sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->slice_sysctl_ctx);
		sc->slice_sysctl_tree = NULL;
	}
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/*
	 * Random information
	 */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
	    CTLFLAG_RD, &sc->link_width, 0, "link width");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
	    CTLFLAG_RD, &sc->read_write_dma, 0,
	    "DMA concurrent Read/Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
	    CTLFLAG_RD, &sc->watchdog_resets, 0,
	    "Number of times NIC was reset");

	/*
	 * Performance related tunables
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
	    "Interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
	    "Transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
	    "Use RSS");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
	    CTLFLAG_RW, &mxge_deassert_wait, 0,
	    "Wait for IRQ line to go low in ihandler");

	/*
	 * Stats block from firmware is in network byte order.
	 * Need to swap it
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
	    mxge_handle_be32, "I", "link up");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
	    mxge_handle_be32, "I", "rdma_tags_available");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
	    mxge_handle_be32, "I", "dropped_bad_crc32");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
	    mxge_handle_be32, "I", "dropped_bad_phy");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
	    mxge_handle_be32, "I", "dropped_link_overflow");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_multicast_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_big_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_small_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
	    mxge_handle_be32, "I", "dropped_overrun");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
	    mxge_handle_be32, "I", "dropped_pause");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
	    mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_unicast_filtered");

	/* Add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
	if (sc->slice_sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add slice sysctl node\n");
		return;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
		    slice_num, CTLFLAG_RD, 0, "");
		if (ss->sysctl_tree == NULL) {
			device_printf(sc->dev,
			    "can't add %d slice sysctl node\n", slice);
			return;	/* XXX continue? */
		}
		children = SYSCTL_CHILDREN(ss->sysctl_tree);

		/*
		 * XXX change to ULONG
		 */

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * backwards one at a time and handle ring wraps
 */
static __inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
static __inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

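	/*
	 * Each send request is 16 bytes, so copying two requests per
	 * iteration keeps every PIO burst at 32 bytes, which avoids
	 * triggering the NIC's software PIO handler (see the comment
	 * above this function).
	 */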
	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < cnt - 1; i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * Submit all but the first request, and ensure
		 * that it is submitted below
		 */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* Submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* Re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_pullup_tso(struct mbuf **mp)
{
	int hoff, iphlen, thoff;
	struct mbuf *m;

	m = *mp;
	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));

	iphlen = m->m_pkthdr.csum_iphlen;
	thoff = m->m_pkthdr.csum_thlen;
	hoff = m->m_pkthdr.csum_lhlen;

	KASSERT(iphlen > 0, ("invalid ip hlen"));
	KASSERT(thoff > 0, ("invalid tcp hlen"));
	KASSERT(hoff > 0, ("invalid ether hlen"));

	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
		m = m_pullup(m, hoff + iphlen + thoff);
		if (m == NULL) {
			*mp = NULL;
			return ENOBUFS;
		}
		*mp = m;
	}
	return 0;
}

static int
mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
    struct mbuf *m, int busdma_seg_cnt)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	struct mxge_buffer_state *info_last;
	bus_dmamap_t map = info_map->map;

	mss = m->m_pkthdr.tso_segsz;

	/*
	 * Negative cum_len signifies to the send loop that we are
	 * still in the header portion of the TSO packet.
	 */
	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
	    m->m_pkthdr.csum_thlen);

	/*
	 * TSO implies checksum offload on this hardware
	 */
	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/*
	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
	 * out where to put the checksum by parsing the header.
	 */
	pseudo_hdr_offset = htobe16(mss);

	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;

	/*
	 * "rdma_count" is the number of RDMAs belonging to the current
	 * packet BEFORE the current send request.  For non-TSO packets,
	 * this is equal to "count".
	 *
	 * For TSO packets, rdma_count needs to be reset to 0 after a
	 * segment cut.
	 *
	 * The rdma_count field of the send request is the number of
	 * RDMAs of the packet starting at that request.  For TSO send
	 * requests with one or more cuts in the middle, this is the
	 * number of RDMAs starting after the last cut in the request.
	 * All previous segments before the last cut implicitly have 1
	 * RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand, it must be
	 * filled-in retroactively - after each segmentation cut or at
	 * the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/*
		 * Break the busdma segment up into pieces
		 */
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req - rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* Payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |=
				    next_is_first * MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* Header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
				    MXGEFW_FLAGS_FIRST |
				    (small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags =
			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req - rdma_count)->rdma_count = rdma_count;

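	/*
	 * Walk backwards from the final request, tagging requests with
	 * TSO_LAST until we reach the start of the last TSO segment,
	 * which is marked by either TSO_CHOP or FIRST.
	 */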
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];

	info_map->map = info_last->map;
	info_last->map = map;
	info_last->m = m;

	mxge_submit_req(tx, tx->req_list, cnt);

	if (tx->send_go != NULL && tx->queue_active == 0) {
		/* Tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
	return 0;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	return ENOBUFS;
}
1811 
1812 static int
1813 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1814 {
1815 	mcp_kreq_ether_send_t *req;
1816 	bus_dma_segment_t *seg;
1817 	bus_dmamap_t map;
1818 	int cnt, cum_len, err, i, idx, odd_flag;
1819 	uint16_t pseudo_hdr_offset;
1820 	uint8_t flags, cksum_offset;
1821 	struct mxge_buffer_state *info_map, *info_last;
1822 
1823 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1824 		err = mxge_pullup_tso(&m);
1825 		if (__predict_false(err))
1826 			return err;
1827 	}
1828 
1829 	/*
1830 	 * Map the frame for DMA
1831 	 */
1832 	idx = tx->req & tx->mask;
1833 	info_map = &tx->info[idx];
1834 	map = info_map->map;
1835 
1836 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1837 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1838 	if (__predict_false(err != 0))
1839 		goto drop;
1840 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1841 
1842 	/*
1843 	 * TSO is different enough, we handle it in another routine
1844 	 */
1845 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1846 		return mxge_encap_tso(tx, info_map, m, cnt);
1847 
1848 	req = tx->req_list;
1849 	cksum_offset = 0;
1850 	pseudo_hdr_offset = 0;
1851 	flags = MXGEFW_FLAGS_NO_TSO;
1852 
1853 	/*
1854 	 * Checksum offloading
1855 	 */
1856 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1857 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1858 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1859 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1860 		req->cksum_offset = cksum_offset;
1861 		flags |= MXGEFW_FLAGS_CKSUM;
1862 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1863 	} else {
1864 		odd_flag = 0;
1865 	}
1866 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1867 		flags |= MXGEFW_FLAGS_SMALL;
1868 
1869 	/*
1870 	 * Convert segments into a request list
1871 	 */
1872 	cum_len = 0;
1873 	seg = tx->seg_list;
1874 	req->flags = MXGEFW_FLAGS_FIRST;
1875 	for (i = 0; i < cnt; i++) {
1876 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1877 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1878 		req->length = htobe16(seg->ds_len);
1879 		req->cksum_offset = cksum_offset;
1880 		if (cksum_offset > seg->ds_len)
1881 			cksum_offset -= seg->ds_len;
1882 		else
1883 			cksum_offset = 0;
1884 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1885 		req->pad = 0; /* complete solid 16-byte block */
1886 		req->rdma_count = 1;
1887 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1888 		cum_len += seg->ds_len;
1889 		seg++;
1890 		req++;
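		/*
		 * Pre-clear the next slot's flags: if this was the last
		 * segment, the runt padding below ORs into that slot and
		 * relies on a zeroed flags byte.
		 */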
1891 		req->flags = 0;
1892 	}
1893 	req--;
1894 
1895 	/*
1896 	 * Pad runt to 60 bytes
1897 	 */
1898 	if (cum_len < 60) {
1899 		req++;
1900 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1901 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1902 		req->length = htobe16(60 - cum_len);
1903 		req->cksum_offset = 0;
1904 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1905 		req->pad = 0; /* complete solid 16-byte block */
1906 		req->rdma_count = 1;
1907 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1908 		cnt++;
1909 	}
1910 
1911 	tx->req_list[0].rdma_count = cnt;
1912 #if 0
1913 	/* print what the firmware will see */
1914 	for (i = 0; i < cnt; i++) {
1915 		kprintf("%d: addr: 0x%x 0x%x len:%d pso:%d, "
1916 		    "cso:%d, flags:0x%x, rdma:%d\n",
1917 		    i, (int)ntohl(tx->req_list[i].addr_high),
1918 		    (int)ntohl(tx->req_list[i].addr_low),
1919 		    (int)ntohs(tx->req_list[i].length),
1920 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1921 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1922 		    tx->req_list[i].rdma_count);
1923 	}
1924 	kprintf("--------------\n");
1925 #endif
1926 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1927 
1928 	info_map->map = info_last->map;
1929 	info_last->map = map;
1930 	info_last->m = m;
1931 
1932 	mxge_submit_req(tx, tx->req_list, cnt);
1933 
1934 	if (tx->send_go != NULL && tx->queue_active == 0) {
1935 		/* Tell the NIC to start polling this slice */
1936 		*tx->send_go = 1;
1937 		tx->queue_active = 1;
1938 		tx->activate++;
1939 		wmb();
1940 	}
1941 	return 0;
1942 
1943 drop:
1944 	m_freem(m);
1945 	return err;
1946 }
1947 
1948 static void
1949 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1950 {
1951 	mxge_softc_t *sc = ifp->if_softc;
1952 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1953 	bus_addr_t zeropad;
1954 	int encap = 0;
1955 
1956 	KKASSERT(tx->ifsq == ifsq);
1957 	ASSERT_SERIALIZED(&tx->tx_serialize);
1958 
1959 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1960 		return;
1961 
1962 	zeropad = sc->zeropad_dma.dmem_busaddr;
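	/* Dequeue only while a worst-case (max_desc) packet still fits */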
1963 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1964 		struct mbuf *m;
1965 		int error;
1966 
1967 		m = ifsq_dequeue(ifsq);
1968 		if (m == NULL)
1969 			goto done;
1970 
1971 		BPF_MTAP(ifp, m);
1972 		error = mxge_encap(tx, m, zeropad);
1973 		if (!error)
1974 			encap = 1;
1975 		else
1976 			IFNET_STAT_INC(ifp, oerrors, 1);
1977 	}
1978 
1979 	/* Ran out of transmit slots */
1980 	ifsq_set_oactive(ifsq);
1981 done:
1982 	if (encap)
1983 		tx->watchdog.wd_timer = 5;
1984 }
1985 
1986 static void
1987 mxge_watchdog(struct ifaltq_subque *ifsq)
1988 {
1989 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
1990 	struct mxge_softc *sc = ifp->if_softc;
1991 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1992 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1993 
1994 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
1995 
1996 	/* Check for pause blocking before resetting */
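	/*
	 * If the pause counter did not advance while the watchdog was
	 * armed, the stall cannot be blamed on flow control, so assume
	 * the ring is wedged and reset the NIC.  Otherwise the link
	 * partner is pausing us; warn, but do not reset.
	 */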
1997 	if (tx->watchdog_rx_pause == rx_pause) {
1998 		mxge_warn_stuck(sc, tx, 0);
1999 		mxge_watchdog_reset(sc);
2000 		return;
2001 	} else {
2002 		if_printf(ifp, "Flow control blocking xmits, "
2003 		    "check link partner\n");
2004 	}
2005 	tx->watchdog_rx_pause = rx_pause;
2006 }
2007 
2008 /*
2009  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2010  * at most 32 bytes at a time, so as to avoid involving the software
2011  * PIO handler in the NIC.  We re-write the first segment's low
2012  * DMA address to mark it valid only after we write the entire chunk
2013  * in a burst
2014  */
2015 static __inline void
2016 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2017     mcp_kreq_ether_recv_t *src)
2018 {
2019 	uint32_t low;
2020 
2021 	low = src->addr_low;
2022 	src->addr_low = 0xffffffff;
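	/*
	 * The bogus addr_low keeps the NIC from acting on the chunk
	 * until the real value is written back below.
	 */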
2023 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2024 	wmb();
2025 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2026 	wmb();
2027 	src->addr_low = low;
2028 	dst->addr_low = low;
2029 	wmb();
2030 }
2031 
2032 static int
2033 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2034     boolean_t init)
2035 {
2036 	bus_dma_segment_t seg;
2037 	struct mbuf *m;
2038 	int cnt, err, mflag;
2039 
2040 	mflag = M_NOWAIT;
2041 	if (__predict_false(init))
2042 		mflag = M_WAITOK;
2043 
2044 	m = m_gethdr(mflag, MT_DATA);
2045 	if (m == NULL) {
2046 		err = ENOBUFS;
2047 		if (__predict_false(init)) {
2048 			/*
2049 			 * During initialization, there
2050 			 * is nothing to setup; bail out
2051 			 */
2052 			return err;
2053 		}
2054 		goto done;
2055 	}
2056 	m->m_len = m->m_pkthdr.len = MHLEN;
2057 
2058 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2059 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2060 	if (err != 0) {
2061 		m_freem(m);
2062 		if (__predict_false(init)) {
2063 			/*
2064 			 * During initialization, there
2065 			 * is nothing to setup; bail out
2066 			 */
2067 			return err;
2068 		}
2069 		goto done;
2070 	}
2071 
2072 	rx->info[idx].m = m;
2073 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2074 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2075 
2076 done:
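	/* Descriptors are handed to the NIC in aligned bursts of 8 */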
2077 	if ((idx & 7) == 7)
2078 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2079 	return err;
2080 }
2081 
2082 static int
2083 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2084     boolean_t init)
2085 {
2086 	bus_dma_segment_t seg;
2087 	struct mbuf *m;
2088 	int cnt, err, mflag;
2089 
2090 	mflag = M_NOWAIT;
2091 	if (__predict_false(init))
2092 		mflag = M_WAITOK;
2093 
2094 	if (rx->cl_size == MCLBYTES)
2095 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2096 	else
2097 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2098 	if (m == NULL) {
2099 		err = ENOBUFS;
2100 		if (__predict_false(init)) {
2101 			/*
2102 			 * During initialization, there
2103 			 * is nothing to setup; bail out
2104 			 */
2105 			return err;
2106 		}
2107 		goto done;
2108 	}
2109 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2110 
2111 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2112 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2113 	if (err != 0) {
2114 		m_freem(m);
2115 		if (__predict_false(init)) {
2116 			/*
2117 			 * During initialization, there
2118 			 * is nothing to setup; bail out
2119 			 */
2120 			return err;
2121 		}
2122 		goto done;
2123 	}
2124 
2125 	rx->info[idx].m = m;
2126 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2127 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2128 
2129 done:
2130 	if ((idx & 7) == 7)
2131 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2132 	return err;
2133 }
2134 
2135 /*
2136  * Myri10GE hardware checksums are not valid if the sender
2137  * padded the frame with non-zero padding.  This is because
2138  * the firmware just does a simple 16-bit 1s complement
2139  * checksum across the entire frame, excluding the first 14
2140  * bytes.  It is best to simply check the checksum and
2141  * tell the stack about it only if the checksum is good
2142  */
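/*
 * Note: a valid IP header itself sums to 0xffff (zero in ones-
 * complement), so the hardware sum is congruent to the sum over just
 * the TCP/UDP segment.  Adding the pseudo header via in_pseudo() and
 * inverting the result therefore yields 0 exactly when the transport
 * checksum is good.
 */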
2143 static __inline uint16_t
2144 mxge_rx_csum(struct mbuf *m, int csum)
2145 {
2146 	const struct ether_header *eh;
2147 	const struct ip *ip;
2148 	uint16_t c;
2149 
2150 	eh = mtod(m, const struct ether_header *);
2151 
2152 	/* Only deal with IPv4 TCP & UDP for now */
2153 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2154 		return 1;
2155 
2156 	ip = (const struct ip *)(eh + 1);
2157 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2158 		return 1;
2159 
2160 #ifdef INET
2161 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2162 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2163 	          (ip->ip_hl << 2) + ip->ip_p));
2164 #else
2165 	c = 1;
2166 #endif
2167 	c ^= 0xffff;
2168 	return c;
2169 }
2170 
2171 static void
2172 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2173 {
2174 	struct ether_vlan_header *evl;
2175 	uint32_t partial;
2176 
2177 	evl = mtod(m, struct ether_vlan_header *);
2178 
2179 	/*
2180 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2181 	 * what the firmware thought was the end of the ethernet
2182 	 * header.
2183 	 */
2184 
2185 	/* Put checksum into host byte order */
2186 	*csum = ntohs(*csum);
2187 
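	/*
	 * In ones-complement arithmetic, subtracting x is equivalent to
	 * adding ~x with end-around carry; the two fold steps below
	 * collapse the 32-bit accumulator back into 16 bits.
	 */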
2188 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2189 	*csum += ~partial;
2190 	*csum += ((*csum) < ~partial);
2191 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2192 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2193 
2194 	/*
2195 	 * Restore checksum to network byte order;
2196 	 * later consumers expect this
2197 	 */
2198 	*csum = htons(*csum);
2199 
2200 	/* save the tag */
2201 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2202 	m->m_flags |= M_VLANTAG;
2203 
2204 	/*
2205 	 * Remove the 802.1q header by copying the Ethernet
2206 	 * addresses over it and adjusting the beginning of
2207 	 * the data in the mbuf.  The encapsulated Ethernet
2208 	 * type field is already in place.
2209 	 */
2210 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2211 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2212 	m_adj(m, EVL_ENCAPLEN);
2213 }
2214 
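#if 0
/*
 * A minimal, hypothetical userland sketch (not part of the driver, and
 * the names here are made up) demonstrating the end-around-carry
 * subtraction used above: removing a 32-bit chunk "partial" from a
 * folded 16-bit ones-complement sum.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
csum_sub32(uint32_t csum, uint32_t partial)
{
	csum += ~partial;
	csum += (csum < ~partial);		/* end-around carry */
	csum = (csum >> 16) + (csum & 0xffff);	/* fold to 16 bits */
	csum = (csum >> 16) + (csum & 0xffff);	/* fold the last carry */
	return csum;
}

int
main(void)
{
	/*
	 * A sum that included the chunk 0x11112222 (0x1111 + 0x2222
	 * once folded); removing it must restore the remaining 0x1234.
	 */
	uint32_t sum = 0x1234 + 0x1111 + 0x2222;

	printf("0x%x\n", csum_sub32(sum, 0x11112222));	/* prints 0x1234 */
	return 0;
}
#endif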
2215 
2216 static __inline void
2217 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2218     uint32_t len, uint32_t csum)
2219 {
2220 	struct mbuf *m;
2221 	const struct ether_header *eh;
2222 	bus_dmamap_t old_map;
2223 	int idx;
2224 
2225 	idx = rx->cnt & rx->mask;
2226 	rx->cnt++;
2227 
2228 	/* Save a pointer to the received mbuf */
2229 	m = rx->info[idx].m;
2230 
2231 	/* Try to replace the received mbuf */
2232 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2233 		/* Drop the frame -- the old mbuf is re-cycled */
2234 		IFNET_STAT_INC(ifp, ierrors, 1);
2235 		return;
2236 	}
2237 
2238 	/* Unmap the received buffer */
2239 	old_map = rx->info[idx].map;
2240 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2241 	bus_dmamap_unload(rx->dmat, old_map);
2242 
2243 	/* Swap the bus_dmamap_t's */
2244 	rx->info[idx].map = rx->extra_map;
2245 	rx->extra_map = old_map;
2246 
2247 	/*
2248 	 * The MCP implicitly skips the first 2 bytes so that the
2249 	 * packet is properly aligned
2250 	 */
2251 	m->m_data += MXGEFW_PAD;
2252 
2253 	m->m_pkthdr.rcvif = ifp;
2254 	m->m_len = m->m_pkthdr.len = len;
2255 
2256 	IFNET_STAT_INC(ifp, ipackets, 1);
2257 
2258 	eh = mtod(m, const struct ether_header *);
2259 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2260 		mxge_vlan_tag_remove(m, &csum);
2261 
2262 	/* If the checksum is valid, mark it in the mbuf header */
2263 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2264 	    mxge_rx_csum(m, csum) == 0) {
2265 		/* Tell the stack that the checksum is good */
2266 		m->m_pkthdr.csum_data = 0xffff;
2267 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2268 		    CSUM_DATA_VALID;
2269 	}
2270 	ifp->if_input(ifp, m, NULL, -1);
2271 }
2272 
2273 static __inline void
2274 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2275     uint32_t len, uint32_t csum)
2276 {
2277 	const struct ether_header *eh;
2278 	struct mbuf *m;
2279 	bus_dmamap_t old_map;
2280 	int idx;
2281 
2282 	idx = rx->cnt & rx->mask;
2283 	rx->cnt++;
2284 
2285 	/* Save a pointer to the received mbuf */
2286 	m = rx->info[idx].m;
2287 
2288 	/* Try to replace the received mbuf */
2289 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2290 		/* Drop the frame -- the old mbuf is re-cycled */
2291 		IFNET_STAT_INC(ifp, ierrors, 1);
2292 		return;
2293 	}
2294 
2295 	/* Unmap the received buffer */
2296 	old_map = rx->info[idx].map;
2297 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2298 	bus_dmamap_unload(rx->dmat, old_map);
2299 
2300 	/* Swap the bus_dmamap_t's */
2301 	rx->info[idx].map = rx->extra_map;
2302 	rx->extra_map = old_map;
2303 
2304 	/*
2305 	 * The MCP implicitly skips the first 2 bytes so that the
2306 	 * packet is properly aligned
2307 	 */
2308 	m->m_data += MXGEFW_PAD;
2309 
2310 	m->m_pkthdr.rcvif = ifp;
2311 	m->m_len = m->m_pkthdr.len = len;
2312 
2313 	IFNET_STAT_INC(ifp, ipackets, 1);
2314 
2315 	eh = mtod(m, const struct ether_header *);
2316 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2317 		mxge_vlan_tag_remove(m, &csum);
2318 
2319 	/* If the checksum is valid, mark it in the mbuf header */
2320 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2321 	    mxge_rx_csum(m, csum) == 0) {
2322 		/* Tell the stack that the checksum is good */
2323 		m->m_pkthdr.csum_data = 0xffff;
2324 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2325 		    CSUM_DATA_VALID;
2326 	}
2327 	ifp->if_input(ifp, m, NULL, -1);
2328 }
2329 
2330 static __inline void
2331 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2332 {
2333 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2334 
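	/*
	 * A negative cycle (the interrupt paths pass -1) will not reach
	 * zero for a very long time, so the loop effectively drains the
	 * ring; a positive cycle presumably bounds the work per call.
	 */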
2335 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2336 		uint16_t length, checksum;
2337 
2338 		length = ntohs(rx_done->entry[rx_done->idx].length);
2339 		rx_done->entry[rx_done->idx].length = 0;
2340 
2341 		checksum = rx_done->entry[rx_done->idx].checksum;
2342 
2343 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2344 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2345 			    length, checksum);
2346 		} else {
2347 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2348 			    length, checksum);
2349 		}
2350 
2351 		rx_done->idx++;
2352 		rx_done->idx &= rx_done->mask;
2353 		--cycle;
2354 	}
2355 }
2356 
2357 static __inline void
2358 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2359 {
2360 	ASSERT_SERIALIZED(&tx->tx_serialize);
2361 
2362 	while (tx->pkt_done != mcp_idx) {
2363 		struct mbuf *m;
2364 		int idx;
2365 
2366 		idx = tx->done & tx->mask;
2367 		tx->done++;
2368 
2369 		m = tx->info[idx].m;
2370 		/*
2371 		 * The mbuf and its DMA map are attached to only one
2372 		 * descriptor per packet; the other slots have a NULL m.
2373 		 */
2374 		if (m != NULL) {
2375 			tx->pkt_done++;
2376 			IFNET_STAT_INC(ifp, opackets, 1);
2377 			tx->info[idx].m = NULL;
2378 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2379 			m_freem(m);
2380 		}
2381 	}
2382 
2383 	/*
2384 	 * If we have space, clear OACTIVE to tell the stack that
2385 	 * it's OK to send packets
2386 	 */
2387 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2388 		ifsq_clr_oactive(tx->ifsq);
2389 		if (tx->req == tx->done) {
2390 			/* Reset watchdog */
2391 			tx->watchdog.wd_timer = 0;
2392 		}
2393 	}
2394 
2395 	if (!ifsq_is_empty(tx->ifsq))
2396 		ifsq_devstart(tx->ifsq);
2397 
2398 	if (tx->send_stop != NULL && tx->req == tx->done) {
2399 		/*
2400 		 * Let the NIC stop polling this queue, since there
2401 		 * are no more transmits pending
2402 		 */
2403 		*tx->send_stop = 1;
2404 		tx->queue_active = 0;
2405 		tx->deactivate++;
2406 		wmb();
2407 	}
2408 }
2409 
2410 static struct mxge_media_type mxge_xfp_media_types[] = {
2411 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2412 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2413 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2414 	{IFM_NONE,	(1 << 5),	"10GBASE-ER"},
2415 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2416 	{IFM_NONE,	(1 << 3),	"10GBASE-SW"},
2417 	{IFM_NONE,	(1 << 2),	"10GBASE-LW"},
2418 	{IFM_NONE,	(1 << 1),	"10GBASE-EW"},
2419 	{IFM_NONE,	(1 << 0),	"Reserved"}
2420 };
2421 
2422 static struct mxge_media_type mxge_sfp_media_types[] = {
2423 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2424 	{IFM_NONE,	(1 << 7),	"Reserved"},
2425 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2426 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2427 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2428 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2429 };
2430 
2431 static void
2432 mxge_media_set(mxge_softc_t *sc, int media_type)
2433 {
2434 	int fc_opt = 0;
2435 
2436 	if (media_type == IFM_NONE)
2437 		return;
2438 
2439 	if (sc->pause)
2440 		fc_opt = IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
2441 
2442 	ifmedia_add(&sc->media, MXGE_IFM | media_type, 0, NULL);
2443 	ifmedia_set(&sc->media, MXGE_IFM | media_type | fc_opt);
2444 
2445 	sc->current_media = media_type;
2446 }
2447 
2448 static void
2449 mxge_media_unset(mxge_softc_t *sc)
2450 {
2451 	ifmedia_removeall(&sc->media);
2452 	sc->current_media = IFM_NONE;
2453 }
2454 
2455 static void
2456 mxge_media_init(mxge_softc_t *sc)
2457 {
2458 	const char *ptr;
2459 	int i;
2460 
2461 	mxge_media_unset(sc);
2462 
2463 	/*
2464 	 * Parse the product code to determine the interface type
2465 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2466 	 * after the 3rd dash in the driver's cached copy of the
2467 	 * EEPROM's product code string.
2468 	 */
2469 	ptr = sc->product_code_string;
2470 	if (ptr == NULL) {
2471 		if_printf(sc->ifp, "Missing product code\n");
2472 		return;
2473 	}
2474 
2475 	for (i = 0; i < 3; i++, ptr++) {
2476 		ptr = strchr(ptr, '-');
2477 		if (ptr == NULL) {
2478 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2479 			return;
2480 		}
2481 	}
2482 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2483 		/* -C is CX4 */
2484 		sc->connector = MXGE_CX4;
2485 		mxge_media_set(sc, IFM_10G_CX4);
2486 	} else if (*ptr == 'Q') {
2487 		/* -Q is Quad Ribbon Fiber */
2488 		sc->connector = MXGE_QRF;
2489 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2490 		/* DragonFly has no media type for Quad ribbon fiber */
2491 	} else if (*ptr == 'R') {
2492 		/* -R is XFP */
2493 		sc->connector = MXGE_XFP;
2494 		/* NOTE: ifmedia will be installed later */
2495 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2496 		/* -S or -2S is SFP+ */
2497 		sc->connector = MXGE_SFP;
2498 		/* NOTE: ifmedia will be installed later */
2499 	} else {
2500 		sc->connector = MXGE_UNK;
2501 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2502 	}
2503 }
2504 
2505 /*
2506  * Determine the media type for a NIC.  Some XFPs will identify
2507  * themselves only when their link is up, so this is initiated via a
2508  * link up interrupt.  However, this can potentially take up to
2509  * several milliseconds, so it is run via the watchdog routine, rather
2510  * than in the interrupt handler itself.
2511  */
2512 static void
2513 mxge_media_probe(mxge_softc_t *sc)
2514 {
2515 	mxge_cmd_t cmd;
2516 	const char *cage_type;
2517 	struct mxge_media_type *mxge_media_types = NULL;
2518 	int i, err, ms, mxge_media_type_entries;
2519 	uint32_t byte;
2520 
2521 	sc->need_media_probe = 0;
2522 
2523 	if (sc->connector == MXGE_XFP) {
2524 		/* -R is XFP */
2525 		mxge_media_types = mxge_xfp_media_types;
2526 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2527 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2528 		cage_type = "XFP";
2529 	} else if (sc->connector == MXGE_SFP) {
2530 		/* -S or -2S is SFP+ */
2531 		mxge_media_types = mxge_sfp_media_types;
2532 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2533 		cage_type = "SFP+";
2534 		byte = 3;
2535 	} else {
2536 		/* nothing to do; media type cannot change */
2537 		return;
2538 	}
2539 
2540 	/*
2541 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2542 	 * we try to determine what is in the cage by using the
2543 	 * firmware's I2C commands to read the module's 10GbE compliance
2544 	 * register.  We read just one byte, which may take over
2545 	 * a millisecond
2546 	 */
2547 
2548 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
2549 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2550 	cmd.data1 = byte;
2551 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2552 	if (err != MXGEFW_CMD_OK) {
2553 		if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2554 			if_printf(sc->ifp, "failed to read XFP\n");
2555 		else if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2556 			if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2557 		else
2558 			if_printf(sc->ifp, "I2C read failed, err: %d\n", err);
2559 		mxge_media_unset(sc);
2560 		return;
2561 	}
2562 
2563 	/* Now we wait for the data to be cached */
2564 	cmd.data0 = byte;
2565 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2566 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2567 		DELAY(1000);
2568 		cmd.data0 = byte;
2569 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2570 	}
2571 	if (err != MXGEFW_CMD_OK) {
2572 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2573 		    cage_type, err, ms);
2574 		mxge_media_unset(sc);
2575 		return;
2576 	}
2577 
2578 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2579 		if (bootverbose) {
2580 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2581 			    mxge_media_types[0].name);
2582 		}
2583 		if (sc->current_media != mxge_media_types[0].flag) {
2584 			mxge_media_unset(sc);
2585 			mxge_media_set(sc, mxge_media_types[0].flag);
2586 		}
2587 		return;
2588 	}
2589 	for (i = 1; i < mxge_media_type_entries; i++) {
2590 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2591 			if (bootverbose) {
2592 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2593 				    mxge_media_types[i].name);
2594 			}
2595 
2596 			if (sc->current_media != mxge_media_types[i].flag) {
2597 				mxge_media_unset(sc);
2598 				mxge_media_set(sc, mxge_media_types[i].flag);
2599 			}
2600 			return;
2601 		}
2602 	}
2603 	mxge_media_unset(sc);
2604 	if (bootverbose) {
2605 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2606 		    cmd.data0);
2607 	}
2608 }
2609 
2610 static void
2611 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2612 {
2613 	if (sc->link_state != stats->link_up) {
2614 		sc->link_state = stats->link_up;
2615 		if (sc->link_state) {
2616 			sc->ifp->if_link_state = LINK_STATE_UP;
2617 			if_link_state_change(sc->ifp);
2618 			if (bootverbose)
2619 				if_printf(sc->ifp, "link up\n");
2620 		} else {
2621 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2622 			if_link_state_change(sc->ifp);
2623 			if (bootverbose)
2624 				if_printf(sc->ifp, "link down\n");
2625 		}
2626 		sc->need_media_probe = 1;
2627 	}
2628 
2629 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2630 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2631 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2632 		    sc->rdma_tags_available);
2633 	}
2634 
2635 	if (stats->link_down) {
2636 		sc->down_cnt += stats->link_down;
2637 		sc->link_state = 0;
2638 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2639 		if_link_state_change(sc->ifp);
2640 	}
2641 }
2642 
2643 static void
2644 mxge_serialize_skipmain(struct mxge_softc *sc)
2645 {
2646 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2647 }
2648 
2649 static void
2650 mxge_deserialize_skipmain(struct mxge_softc *sc)
2651 {
2652 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2653 }
2654 
2655 static void
2656 mxge_legacy(void *arg)
2657 {
2658 	struct mxge_slice_state *ss = arg;
2659 	mxge_softc_t *sc = ss->sc;
2660 	mcp_irq_data_t *stats = ss->fw_stats;
2661 	mxge_tx_ring_t *tx = &ss->tx;
2662 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2663 	uint32_t send_done_count;
2664 	uint8_t valid;
2665 
2666 	ASSERT_SERIALIZED(&sc->main_serialize);
2667 
2668 	/* Make sure the DMA has finished */
2669 	if (!stats->valid)
2670 		return;
2671 	valid = stats->valid;
2672 
2673 	/* Lower legacy IRQ */
2674 	*sc->irq_deassert = 0;
2675 	if (!mxge_deassert_wait) {
2676 		/* Don't wait for conf. that irq is low */
2677 		stats->valid = 0;
2678 	}
2679 
2680 	mxge_serialize_skipmain(sc);
2681 
2682 	/*
2683 	 * Loop while waiting for legacy irq deassertion
2684 	 * XXX do we really want to loop?
2685 	 */
2686 	do {
2687 		/* Check for transmit completes and receives */
2688 		send_done_count = be32toh(stats->send_done_count);
2689 		while ((send_done_count != tx->pkt_done) ||
2690 		       (rx_done->entry[rx_done->idx].length != 0)) {
2691 			if (send_done_count != tx->pkt_done) {
2692 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2693 				    (int)send_done_count);
2694 			}
2695 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2696 			send_done_count = be32toh(stats->send_done_count);
2697 		}
2698 		if (mxge_deassert_wait)
2699 			wmb();
2700 	} while (*((volatile uint8_t *)&stats->valid));
2701 
2702 	mxge_deserialize_skipmain(sc);
2703 
2704 	/* Fw link & error stats are meaningful only on the first slice */
2705 	if (__predict_false(stats->stats_updated))
2706 		mxge_intr_status(sc, stats);
2707 
2708 	/* Check to see if we have rx token to pass back */
2709 	if (valid & 0x1)
2710 		*ss->irq_claim = be32toh(3);
2711 	*(ss->irq_claim + 1) = be32toh(3);
2712 }
2713 
2714 static void
2715 mxge_msi(void *arg)
2716 {
2717 	struct mxge_slice_state *ss = arg;
2718 	mxge_softc_t *sc = ss->sc;
2719 	mcp_irq_data_t *stats = ss->fw_stats;
2720 	mxge_tx_ring_t *tx = &ss->tx;
2721 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2722 	uint32_t send_done_count;
2723 	uint8_t valid;
2724 #ifndef IFPOLL_ENABLE
2725 	const boolean_t polling = FALSE;
2726 #else
2727 	boolean_t polling = FALSE;
2728 #endif
2729 
2730 	ASSERT_SERIALIZED(&sc->main_serialize);
2731 
2732 	/* Make sure the DMA has finished */
2733 	if (__predict_false(!stats->valid))
2734 		return;
2735 
2736 	valid = stats->valid;
2737 	stats->valid = 0;
2738 
2739 #ifdef IFPOLL_ENABLE
2740 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2741 		polling = TRUE;
2742 #endif
2743 
2744 	if (!polling) {
2745 		/* Check for receives */
2746 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2747 		if (rx_done->entry[rx_done->idx].length != 0)
2748 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2749 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2750 	}
2751 
2752 	/*
2753 	 * Check for transmit completes
2754 	 *
2755 	 * NOTE:
2756 	 * Since pkt_done is only changed by mxge_tx_done(),
2757 	 * which is called only in interrupt handler, the
2758 	 * check w/o holding tx serializer is MPSAFE.
2759 	 */
2760 	send_done_count = be32toh(stats->send_done_count);
2761 	if (send_done_count != tx->pkt_done) {
2762 		lwkt_serialize_enter(&tx->tx_serialize);
2763 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2764 		lwkt_serialize_exit(&tx->tx_serialize);
2765 	}
2766 
2767 	if (__predict_false(stats->stats_updated))
2768 		mxge_intr_status(sc, stats);
2769 
2770 	/* Check to see if we have rx token to pass back */
2771 	if (!polling && (valid & 0x1))
2772 		*ss->irq_claim = be32toh(3);
2773 	*(ss->irq_claim + 1) = be32toh(3);
2774 }
2775 
2776 static void
2777 mxge_msix_rx(void *arg)
2778 {
2779 	struct mxge_slice_state *ss = arg;
2780 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2781 
2782 #ifdef IFPOLL_ENABLE
2783 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2784 		return;
2785 #endif
2786 
2787 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2788 
2789 	if (rx_done->entry[rx_done->idx].length != 0)
2790 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2791 
2792 	*ss->irq_claim = be32toh(3);
2793 }
2794 
2795 static void
2796 mxge_msix_rxtx(void *arg)
2797 {
2798 	struct mxge_slice_state *ss = arg;
2799 	mxge_softc_t *sc = ss->sc;
2800 	mcp_irq_data_t *stats = ss->fw_stats;
2801 	mxge_tx_ring_t *tx = &ss->tx;
2802 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2803 	uint32_t send_done_count;
2804 	uint8_t valid;
2805 #ifndef IFPOLL_ENABLE
2806 	const boolean_t polling = FALSE;
2807 #else
2808 	boolean_t polling = FALSE;
2809 #endif
2810 
2811 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2812 
2813 	/* Make sure the DMA has finished */
2814 	if (__predict_false(!stats->valid))
2815 		return;
2816 
2817 	valid = stats->valid;
2818 	stats->valid = 0;
2819 
2820 #ifdef IFPOLL_ENABLE
2821 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2822 		polling = TRUE;
2823 #endif
2824 
2825 	/* Check for receives */
2826 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2827 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2828 
2829 	/*
2830 	 * Check for transmit completes
2831 	 *
2832 	 * NOTE:
2833 	 * Since pkt_done is only changed by mxge_tx_done(),
2834 	 * which is called only in interrupt handler, the
2835 	 * check w/o holding tx serializer is MPSAFE.
2836 	 */
2837 	send_done_count = be32toh(stats->send_done_count);
2838 	if (send_done_count != tx->pkt_done) {
2839 		lwkt_serialize_enter(&tx->tx_serialize);
2840 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2841 		lwkt_serialize_exit(&tx->tx_serialize);
2842 	}
2843 
2844 	/* Check to see if we have rx token to pass back */
2845 	if (!polling && (valid & 0x1))
2846 		*ss->irq_claim = be32toh(3);
2847 	*(ss->irq_claim + 1) = be32toh(3);
2848 }
2849 
2850 static void
2851 mxge_init(void *arg)
2852 {
2853 	struct mxge_softc *sc = arg;
2854 
2855 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2856 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2857 		mxge_open(sc);
2858 }
2859 
2860 static void
2861 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2862 {
2863 	int i;
2864 
2865 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2866 		if (ss->rx_data.rx_big.info[i].m == NULL)
2867 			continue;
2868 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2869 		    ss->rx_data.rx_big.info[i].map);
2870 		m_freem(ss->rx_data.rx_big.info[i].m);
2871 		ss->rx_data.rx_big.info[i].m = NULL;
2872 	}
2873 
2874 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2875 		if (ss->rx_data.rx_small.info[i].m == NULL)
2876 			continue;
2877 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2878 		    ss->rx_data.rx_small.info[i].map);
2879 		m_freem(ss->rx_data.rx_small.info[i].m);
2880 		ss->rx_data.rx_small.info[i].m = NULL;
2881 	}
2882 
2883 	/* Transmit ring used only on the first slice */
2884 	if (ss->tx.info == NULL)
2885 		return;
2886 
2887 	for (i = 0; i <= ss->tx.mask; i++) {
2888 		if (ss->tx.info[i].m == NULL)
2889 			continue;
2890 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2891 		m_freem(ss->tx.info[i].m);
2892 		ss->tx.info[i].m = NULL;
2893 	}
2894 }
2895 
2896 static void
2897 mxge_free_mbufs(mxge_softc_t *sc)
2898 {
2899 	int slice;
2900 
2901 	for (slice = 0; slice < sc->num_slices; slice++)
2902 		mxge_free_slice_mbufs(&sc->ss[slice]);
2903 }
2904 
2905 static void
2906 mxge_free_slice_rings(struct mxge_slice_state *ss)
2907 {
2908 	int i;
2909 
2910 	if (ss->rx_data.rx_done.entry != NULL) {
2911 		mxge_dma_free(&ss->rx_done_dma);
2912 		ss->rx_data.rx_done.entry = NULL;
2913 	}
2914 
2915 	if (ss->tx.req_list != NULL) {
2916 		kfree(ss->tx.req_list, M_DEVBUF);
2917 		ss->tx.req_list = NULL;
2918 	}
2919 
2920 	if (ss->tx.seg_list != NULL) {
2921 		kfree(ss->tx.seg_list, M_DEVBUF);
2922 		ss->tx.seg_list = NULL;
2923 	}
2924 
2925 	if (ss->rx_data.rx_small.shadow != NULL) {
2926 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2927 		ss->rx_data.rx_small.shadow = NULL;
2928 	}
2929 
2930 	if (ss->rx_data.rx_big.shadow != NULL) {
2931 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2932 		ss->rx_data.rx_big.shadow = NULL;
2933 	}
2934 
2935 	if (ss->tx.info != NULL) {
2936 		if (ss->tx.dmat != NULL) {
2937 			for (i = 0; i <= ss->tx.mask; i++) {
2938 				bus_dmamap_destroy(ss->tx.dmat,
2939 				    ss->tx.info[i].map);
2940 			}
2941 			bus_dma_tag_destroy(ss->tx.dmat);
2942 		}
2943 		kfree(ss->tx.info, M_DEVBUF);
2944 		ss->tx.info = NULL;
2945 	}
2946 
2947 	if (ss->rx_data.rx_small.info != NULL) {
2948 		if (ss->rx_data.rx_small.dmat != NULL) {
2949 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2950 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2951 				    ss->rx_data.rx_small.info[i].map);
2952 			}
2953 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2954 			    ss->rx_data.rx_small.extra_map);
2955 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2956 		}
2957 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2958 		ss->rx_data.rx_small.info = NULL;
2959 	}
2960 
2961 	if (ss->rx_data.rx_big.info != NULL) {
2962 		if (ss->rx_data.rx_big.dmat != NULL) {
2963 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2964 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2965 				    ss->rx_data.rx_big.info[i].map);
2966 			}
2967 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2968 			    ss->rx_data.rx_big.extra_map);
2969 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2970 		}
2971 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2972 		ss->rx_data.rx_big.info = NULL;
2973 	}
2974 }
2975 
2976 static void
2977 mxge_free_rings(mxge_softc_t *sc)
2978 {
2979 	int slice;
2980 
2981 	if (sc->ss == NULL)
2982 		return;
2983 
2984 	for (slice = 0; slice < sc->num_slices; slice++)
2985 		mxge_free_slice_rings(&sc->ss[slice]);
2986 }
2987 
2988 static int
2989 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2990     int tx_ring_entries)
2991 {
2992 	mxge_softc_t *sc = ss->sc;
2993 	size_t bytes;
2994 	int err, i;
2995 
2996 	/*
2997 	 * Allocate per-slice receive resources
2998 	 */
2999 
3000 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
3001 	    rx_ring_entries - 1;
3002 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
3003 
3004 	/* Allocate the rx shadow rings */
3005 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3006 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3007 
3008 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3009 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3010 
3011 	/* Allocate the rx host info rings */
3012 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3013 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 
3015 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3016 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3017 
3018 	/* Allocate the rx busdma resources */
3019 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3020 				 1,			/* alignment */
3021 				 4096,			/* boundary */
3022 				 BUS_SPACE_MAXADDR,	/* low */
3023 				 BUS_SPACE_MAXADDR,	/* high */
3024 				 NULL, NULL,		/* filter */
3025 				 MHLEN,			/* maxsize */
3026 				 1,			/* num segs */
3027 				 MHLEN,			/* maxsegsize */
3028 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3029 				 			/* flags */
3030 				 &ss->rx_data.rx_small.dmat); /* tag */
3031 	if (err != 0) {
3032 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3033 		    err);
3034 		return err;
3035 	}
3036 
3037 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3038 	    &ss->rx_data.rx_small.extra_map);
3039 	if (err != 0) {
3040 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3041 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3042 		ss->rx_data.rx_small.dmat = NULL;
3043 		return err;
3044 	}
3045 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3046 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3047 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3048 		if (err != 0) {
3049 			int j;
3050 
3051 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3052 
3053 			for (j = 0; j < i; ++j) {
3054 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3055 				    ss->rx_data.rx_small.info[j].map);
3056 			}
3057 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3058 			    ss->rx_data.rx_small.extra_map);
3059 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3060 			ss->rx_data.rx_small.dmat = NULL;
3061 			return err;
3062 		}
3063 	}
3064 
3065 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3066 				 1,			/* alignment */
3067 				 4096,			/* boundary */
3068 				 BUS_SPACE_MAXADDR,	/* low */
3069 				 BUS_SPACE_MAXADDR,	/* high */
3070 				 NULL, NULL,		/* filter */
3071 				 4096,			/* maxsize */
3072 				 1,			/* num segs */
3073 				 4096,			/* maxsegsize*/
3074 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3075 				 			/* flags */
3076 				 &ss->rx_data.rx_big.dmat); /* tag */
3077 	if (err != 0) {
3078 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3079 		    err);
3080 		return err;
3081 	}
3082 
3083 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3084 	    &ss->rx_data.rx_big.extra_map);
3085 	if (err != 0) {
3086 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3087 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3088 		ss->rx_data.rx_big.dmat = NULL;
3089 		return err;
3090 	}
3091 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3092 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3093 		    &ss->rx_data.rx_big.info[i].map);
3094 		if (err != 0) {
3095 			int j;
3096 
3097 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3098 			for (j = 0; j < i; ++j) {
3099 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3100 				    ss->rx_data.rx_big.info[j].map);
3101 			}
3102 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3103 			    ss->rx_data.rx_big.extra_map);
3104 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3105 			ss->rx_data.rx_big.dmat = NULL;
3106 			return err;
3107 		}
3108 	}
3109 
3110 	/*
3111 	 * Now allocate TX resources
3112 	 */
3113 
3114 	ss->tx.mask = tx_ring_entries - 1;
3115 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3116 
3117 	/*
3118 	 * Allocate the tx request copy block; MUST be at least
3119 	 * 8-byte aligned
3120 	 */
3121 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3122 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3123 	    M_DEVBUF, M_WAITOK);
3124 
3125 	/* Allocate the tx busdma segment list */
3126 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3127 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3128 
3129 	/* Allocate the tx host info ring */
3130 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3131 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3132 
3133 	/* Allocate the tx busdma resources */
3134 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3135 				 1,			/* alignment */
3136 				 sc->tx_boundary,	/* boundary */
3137 				 BUS_SPACE_MAXADDR,	/* low */
3138 				 BUS_SPACE_MAXADDR,	/* high */
3139 				 NULL, NULL,		/* filter */
3140 				 IP_MAXPACKET +
3141 				 sizeof(struct ether_vlan_header),
3142 				 			/* maxsize */
3143 				 ss->tx.max_desc - 2,	/* num segs */
3144 				 sc->tx_boundary,	/* maxsegsz */
3145 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3146 				 BUS_DMA_ONEBPAGE,	/* flags */
3147 				 &ss->tx.dmat);		/* tag */
3148 	if (err != 0) {
3149 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3150 		return err;
3151 	}
3152 
3153 	/*
3154 	 * Now use these tags to setup DMA maps for each slot in the ring
3155 	 */
3156 	for (i = 0; i <= ss->tx.mask; i++) {
3157 		err = bus_dmamap_create(ss->tx.dmat,
3158 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3159 		if (err != 0) {
3160 			int j;
3161 
3162 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3163 			for (j = 0; j < i; ++j) {
3164 				bus_dmamap_destroy(ss->tx.dmat,
3165 				    ss->tx.info[j].map);
3166 			}
3167 			bus_dma_tag_destroy(ss->tx.dmat);
3168 			ss->tx.dmat = NULL;
3169 			return err;
3170 		}
3171 	}
3172 	return 0;
3173 }
3174 
3175 static int
3176 mxge_alloc_rings(mxge_softc_t *sc)
3177 {
3178 	mxge_cmd_t cmd;
3179 	int tx_ring_size;
3180 	int tx_ring_entries, rx_ring_entries;
3181 	int err, slice;
3182 
3183 	/* Get ring sizes */
3184 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3185 	if (err != 0) {
3186 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3187 		return err;
3188 	}
3189 	tx_ring_size = cmd.data0;
3190 
3191 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3192 	rx_ring_entries = sc->rx_intr_slots / 2;
3193 
3194 	if (bootverbose) {
3195 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3196 		    tx_ring_entries, rx_ring_entries);
3197 	}
3198 
3199 	sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3200 	sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3201 
3202 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3203 	ifq_set_ready(&sc->ifp->if_snd);
3204 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3205 
3206 	if (sc->num_tx_rings > 1) {
3207 		sc->ifp->if_mapsubq = ifq_mapsubq_mask;
3208 		ifq_set_subq_mask(&sc->ifp->if_snd, sc->num_tx_rings - 1);
3209 	}
3210 
3211 	for (slice = 0; slice < sc->num_slices; slice++) {
3212 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3213 		    rx_ring_entries, tx_ring_entries);
3214 		if (err != 0) {
3215 			device_printf(sc->dev,
3216 			    "alloc %d slice rings failed\n", slice);
3217 			return err;
3218 		}
3219 	}
3220 	return 0;
3221 }
3222 
3223 static void
3224 mxge_choose_params(int mtu, int *cl_size)
3225 {
3226 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3227 
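	/*
	 * E.g. the standard 1500 byte MTU gives 1500 + 14 + 4 + 2 =
	 * 1520 bytes, which fits a regular 2K cluster; larger MTUs, up
	 * to one page worth of buffer, use page-sized jumbo clusters.
	 */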
3228 	if (bufsize < MCLBYTES) {
3229 		*cl_size = MCLBYTES;
3230 	} else {
3231 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3232 		*cl_size = MJUMPAGESIZE;
3233 	}
3234 }
3235 
3236 static int
3237 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3238 {
3239 	mxge_cmd_t cmd;
3240 	int err, i, slice;
3241 
3242 	slice = ss - ss->sc->ss;
3243 
3244 	/*
3245 	 * Get the lanai pointers to the send and receive rings
3246 	 */
3247 	err = 0;
3248 
3249 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
3250 	if (ss->sc->num_tx_rings == 1) {
3251 		if (slice == 0) {
3252 			cmd.data0 = slice;
3253 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3254 			    &cmd);
3255 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3256 			    (ss->sc->sram + cmd.data0);
3257 			/* Leave send_go and send_stop as NULL */
3258 		}
3259 	} else {
3260 		cmd.data0 = slice;
3261 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3262 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3263 		    (ss->sc->sram + cmd.data0);
3264 		ss->tx.send_go = (volatile uint32_t *)
3265 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3266 		ss->tx.send_stop = (volatile uint32_t *)
3267 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3268 	}
3269 
3270 	cmd.data0 = slice;
3271 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3272 	ss->rx_data.rx_small.lanai =
3273 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3274 
3275 	cmd.data0 = slice;
3276 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3277 	ss->rx_data.rx_big.lanai =
3278 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3279 
3280 	if (err != 0) {
3281 		if_printf(ss->sc->ifp,
3282 		    "failed to get ring sizes or locations\n");
3283 		return EIO;
3284 	}
3285 
3286 	/*
3287 	 * Stock small receive ring
3288 	 */
3289 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3290 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3291 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3292 		if (err) {
3293 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3294 			    ss->rx_data.rx_small.mask + 1);
3295 			return ENOMEM;
3296 		}
3297 	}
3298 
3299 	/*
3300 	 * Stock big receive ring
3301 	 */
3302 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3303 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3304 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3305 	}
3306 
3307 	ss->rx_data.rx_big.cl_size = cl_size;
3308 
3309 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3310 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3311 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3312 		if (err) {
3313 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3314 			    ss->rx_data.rx_big.mask + 1);
3315 			return ENOMEM;
3316 		}
3317 	}
3318 	return 0;
3319 }
3320 
3321 static int
3322 mxge_open(mxge_softc_t *sc)
3323 {
3324 	struct ifnet *ifp = sc->ifp;
3325 	mxge_cmd_t cmd;
3326 	int err, slice, cl_size, i;
3327 	bus_addr_t bus;
3328 	volatile uint8_t *itable;
3329 	struct mxge_slice_state *ss;
3330 
3331 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3332 
3333 	/* Copy the MAC address in case it was overridden */
3334 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3335 
3336 	err = mxge_reset(sc, 1);
3337 	if (err != 0) {
3338 		if_printf(ifp, "failed to reset\n");
3339 		return EIO;
3340 	}
3341 
3342 	if (sc->num_slices > 1) {
3343 		/* Setup the indirection table */
3344 		cmd.data0 = sc->num_slices;
3345 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3346 
3347 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3348 		if (err != 0) {
3349 			if_printf(ifp, "failed to setup rss tables\n");
3350 			return err;
3351 		}
3352 
3353 		/* Just enable an identity mapping */
3354 		itable = sc->sram + cmd.data0;
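		/*
		 * The firmware indexes this table with the packet hash
		 * to pick a slice; the identity mapping sends hash
		 * bucket i to slice i.
		 */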
3355 		for (i = 0; i < sc->num_slices; i++)
3356 			itable[i] = (uint8_t)i;
3357 
3358 		if (sc->use_rss) {
3359 			volatile uint8_t *hwkey;
3360 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3361 
3362 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3363 			    &cmd);
3364 			if (err != 0) {
3365 				if_printf(ifp, "failed to get rsskey\n");
3366 				return err;
3367 			}
3368 			hwkey = sc->sram + cmd.data0;
3369 
3370 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3371 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3372 				hwkey[i] = swkey[i];
3373 			wmb();
3374 
3375 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3376 			    &cmd);
3377 			if (err != 0) {
3378 				if_printf(ifp, "failed to update rsskey\n");
3379 				return err;
3380 			}
3381 			if (bootverbose)
3382 				if_printf(ifp, "RSS key updated\n");
3383 		}
3384 
3385 		cmd.data0 = 1;
3386 		if (sc->use_rss) {
3387 			if (bootverbose)
3388 				if_printf(ifp, "input hash: RSS\n");
3389 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3390 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3391 		} else {
3392 			if (bootverbose)
3393 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3394 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3395 		}
3396 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3397 		if (err != 0) {
3398 			if_printf(ifp, "failed to enable slices\n");
3399 			return err;
3400 		}
3401 	}
3402 
3403 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3404 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3405 	if (err) {
3406 		/*
3407 		 * Can't change the TSO mode to NDIS; never allow TSO then
3408 		 */
3409 		if_printf(ifp, "failed to set TSO mode\n");
3410 		ifp->if_capenable &= ~IFCAP_TSO;
3411 		ifp->if_capabilities &= ~IFCAP_TSO;
3412 		ifp->if_hwassist &= ~CSUM_TSO;
3413 	}
3414 
3415 	mxge_choose_params(ifp->if_mtu, &cl_size);
3416 
3417 	cmd.data0 = 1;
3418 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3419 	/*
3420 	 * Error is only meaningful if we're trying to set
3421 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3422 	 */
3423 
3424 	/*
3425 	 * Give the firmware the mtu and the big and small buffer
3426 	 * sizes.  The firmware wants the big buf size to be a power
3427 	 * of two. Luckily, DragonFly's clusters are powers of two
3428 	 */
3429 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3430 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3431 
3432 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3433 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3434 
3435 	cmd.data0 = cl_size;
3436 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3437 
3438 	if (err != 0) {
3439 		if_printf(ifp, "failed to setup params\n");
3440 		goto abort;
3441 	}
3442 
3443 	/* Now give the firmware the pointer to the stats block */
3444 	for (slice = 0; slice < sc->num_slices; slice++) {
3445 		ss = &sc->ss[slice];
3446 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3447 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3448 		cmd.data2 = sizeof(struct mcp_irq_data);
3449 		cmd.data2 |= (slice << 16);
3450 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3451 	}
3452 
3453 	if (err != 0) {
3454 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3455 		bus += offsetof(struct mcp_irq_data, send_done_count);
3456 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3457 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3458 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3459 		    &cmd);
3460 
3461 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3462 		sc->fw_multicast_support = 0;
3463 	} else {
3464 		sc->fw_multicast_support = 1;
3465 	}
3466 
3467 	if (err != 0) {
3468 		if_printf(ifp, "failed to setup params\n");
3469 		goto abort;
3470 	}
3471 
3472 	for (slice = 0; slice < sc->num_slices; slice++) {
3473 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3474 		if (err != 0) {
3475 			if_printf(ifp, "couldn't open slice %d\n", slice);
3476 			goto abort;
3477 		}
3478 	}
3479 
3480 	/* Finally, start the firmware running */
3481 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3482 	if (err) {
3483 		if_printf(ifp, "Couldn't bring up link\n");
3484 		goto abort;
3485 	}
3486 
3487 	ifp->if_flags |= IFF_RUNNING;
3488 	for (i = 0; i < sc->num_tx_rings; ++i) {
3489 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3490 
3491 		ifsq_clr_oactive(tx->ifsq);
3492 		ifsq_watchdog_start(&tx->watchdog);
3493 	}
3494 
3495 	return 0;
3496 
3497 abort:
3498 	mxge_free_mbufs(sc);
3499 	return err;
3500 }
3501 
3502 static void
3503 mxge_close(mxge_softc_t *sc, int down)
3504 {
3505 	struct ifnet *ifp = sc->ifp;
3506 	mxge_cmd_t cmd;
3507 	int err, old_down_cnt, i;
3508 
3509 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3510 
3511 	if (!down) {
3512 		old_down_cnt = sc->down_cnt;
3513 		wmb();
3514 
3515 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3516 		if (err)
3517 			if_printf(ifp, "Couldn't bring down link\n");
3518 
3519 		if (old_down_cnt == sc->down_cnt) {
3520 			/*
3521 			 * Wait for down irq
3522 			 * XXX racy
3523 			 */
3524 			ifnet_deserialize_all(ifp);
3525 			DELAY(10 * sc->intr_coal_delay);
3526 			ifnet_serialize_all(ifp);
3527 		}
3528 
3529 		wmb();
3530 		if (old_down_cnt == sc->down_cnt)
3531 			if_printf(ifp, "never got down irq\n");
3532 	}
3533 	mxge_free_mbufs(sc);
3534 
3535 	ifp->if_flags &= ~IFF_RUNNING;
3536 	for (i = 0; i < sc->num_tx_rings; ++i) {
3537 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3538 
3539 		ifsq_clr_oactive(tx->ifsq);
3540 		ifsq_watchdog_stop(&tx->watchdog);
3541 	}
3542 }
3543 
3544 static void
3545 mxge_setup_cfg_space(mxge_softc_t *sc)
3546 {
3547 	device_t dev = sc->dev;
3548 	int reg;
3549 	uint16_t lnk, pectl;
3550 
3551 	/* Find the PCIe link width and set max read request to 4KB */
3552 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3553 		lnk = pci_read_config(dev, reg + 0x12, 2);
3554 		sc->link_width = (lnk >> 4) & 0x3f;
3555 
3556 		if (sc->pectl == 0) {
3557 			pectl = pci_read_config(dev, reg + 0x8, 2);
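			/*
			 * Bits 14:12 of the device control register
			 * encode the max read request size as
			 * (128 << value); 5 selects 4096 bytes.
			 */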
3558 			pectl = (pectl & ~0x7000) | (5 << 12);
3559 			pci_write_config(dev, reg + 0x8, pectl, 2);
3560 			sc->pectl = pectl;
3561 		} else {
3562 			/* Restore saved pectl after watchdog reset */
3563 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3564 		}
3565 	}
3566 
3567 	/* Enable DMA and memory space access */
3568 	pci_enable_busmaster(dev);
3569 }
3570 
3571 static uint32_t
3572 mxge_read_reboot(mxge_softc_t *sc)
3573 {
3574 	device_t dev = sc->dev;
3575 	uint32_t vs;
3576 
3577 	/* Find the vendor specific offset */
3578 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3579 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3580 		return (uint32_t)-1;
3581 	}
3582 	/* Enable read32 mode */
3583 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3584 	/* Tell NIC which register to read */
3585 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3586 	return pci_read_config(dev, vs + 0x14, 4);
3587 }
3588 
3589 static void
3590 mxge_watchdog_reset(mxge_softc_t *sc)
3591 {
3592 	struct pci_devinfo *dinfo;
3593 	int err, running;
3594 	uint32_t reboot;
3595 	uint16_t cmd;
3596 
3597 	err = ENXIO;
3598 
3599 	if_printf(sc->ifp, "Watchdog reset!\n");
3600 
3601 	/*
3602 	 * Check to see if the NIC rebooted.  If it did, then all of
3603 	 * PCI config space has been reset, and things like the
3604 	 * busmaster bit will be zero.  If this is the case, then we
3605 	 * must restore PCI config space before the NIC can be used
3606 	 * again
3607 	 */
3608 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3609 	if (cmd == 0xffff) {
3610 		/*
3611 		 * Maybe the watchdog caught the NIC rebooting; wait
3612 		 * up to 100ms for it to finish.  If it does not come
3613 		 * back, then give up
3614 		 */
3615 		DELAY(1000*100);
3616 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3617 		if (cmd == 0xffff)
3618 			if_printf(sc->ifp, "NIC disappeared!\n");
3619 	}
3620 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3621 		/* Print the reboot status */
3622 		reboot = mxge_read_reboot(sc);
3623 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3624 
3625 		running = sc->ifp->if_flags & IFF_RUNNING;
3626 		if (running) {
3627 			/*
3628 			 * Quiesce NIC so that TX routines will not try to
3629 			 * xmit after restoration of BAR
3630 			 */
3631 
3632 			/* Mark the link as down */
3633 			if (sc->link_state) {
3634 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3635 				if_link_state_change(sc->ifp);
3636 			}
3637 			mxge_close(sc, 1);
3638 		}
3639 		/* Restore PCI configuration space */
3640 		dinfo = device_get_ivars(sc->dev);
3641 		pci_cfg_restore(sc->dev, dinfo);
3642 
3643 		/* And redo any changes we made to our config space */
3644 		mxge_setup_cfg_space(sc);
3645 
3646 		/* Reload f/w */
3647 		err = mxge_load_firmware(sc, 0);
3648 		if (err)
3649 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3650 		if (running && !err) {
3651 			int i;
3652 
3653 			err = mxge_open(sc);
3654 
3655 			for (i = 0; i < sc->num_tx_rings; ++i)
3656 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3657 		}
3658 		sc->watchdog_resets++;
3659 	} else {
3660 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3661 		err = 0;
3662 	}
3663 	if (err) {
3664 		if_printf(sc->ifp, "watchdog reset failed\n");
3665 	} else {
3666 		if (sc->dying == 2)
3667 			sc->dying = 0;
3668 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3669 	}
3670 }
3671 
3672 static void
3673 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3674 {
3675 	if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3676 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3677 	    tx->req, tx->done, tx->queue_active);
3678 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3679 	    tx->activate, tx->deactivate);
3680 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3681 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3682 }
3683 
3684 static u_long
3685 mxge_update_stats(mxge_softc_t *sc)
3686 {
3687 	u_long ipackets, opackets, pkts;
3688 
3689 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3690 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3691 
3692 	pkts = ipackets - sc->ipackets;
3693 	pkts += opackets - sc->opackets;
3694 
3695 	sc->ipackets = ipackets;
3696 	sc->opackets = opackets;
3697 
3698 	return pkts;
3699 }
3700 
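/*
 * Periodic housekeeping: aggregate statistics, re-probe the media if
 * requested, and make sure an idle NIC has not suffered a h/w fault.
 * The callout interval is stretched 4x while the NIC is idle.
 */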
3701 static void
3702 mxge_tick(void *arg)
3703 {
3704 	mxge_softc_t *sc = arg;
3705 	u_long pkts = 0;
3706 	int err = 0;
3707 	int ticks;
3708 
3709 	lwkt_serialize_enter(&sc->main_serialize);
3710 
3711 	ticks = mxge_ticks;
3712 	if (sc->ifp->if_flags & IFF_RUNNING) {
3713 		/* Aggregate stats from different slices */
3714 		pkts = mxge_update_stats(sc);
3715 		if (sc->need_media_probe)
3716 			mxge_media_probe(sc);
3717 	}
3718 	if (pkts == 0) {
3719 		uint16_t cmd;
3720 
3721 		/* Ensure NIC did not suffer h/w fault while idle */
3722 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3723 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3724 			sc->dying = 2;
3725 			mxge_serialize_skipmain(sc);
3726 			mxge_watchdog_reset(sc);
3727 			mxge_deserialize_skipmain(sc);
3728 			err = ENXIO;
3729 		}
3730 
3731 		/* Look less often if NIC is idle */
3732 		ticks *= 4;
3733 	}
3734 
3735 	if (err == 0)
3736 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3737 
3738 	lwkt_serialize_exit(&sc->main_serialize);
3739 }
3740 
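/*
 * Only the flow control (pause) setting can be changed here; the
 * media itself is fixed (see the IFM_ETH_FORCEPAUSE note in
 * mxge_attach()).
 */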
3741 static int
3742 mxge_media_change(struct ifnet *ifp)
3743 {
3744 	mxge_softc_t *sc = ifp->if_softc;
3745 	const struct ifmedia *ifm = &sc->media;
3746 	int pause;
3747 
3748 	if (IFM_OPTIONS(ifm->ifm_media) & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
3749 		if (sc->pause)
3750 			return 0;
3751 		pause = 1;
3752 	} else {
3753 		if (!sc->pause)
3754 			return 0;
3755 		pause = 0;
3756 	}
3757 	return mxge_change_pause(sc, pause);
3758 }
3759 
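/*
 * Validate and apply a new MTU.  A running interface must be closed
 * and reopened; if that fails, fall back to the old MTU the same way.
 */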
3760 static int
3761 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3762 {
3763 	struct ifnet *ifp = sc->ifp;
3764 	int real_mtu, old_mtu;
3765 	int err = 0;
3766 
3767 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3768 	if (mtu > sc->max_mtu || real_mtu < 60)
3769 		return EINVAL;
3770 
3771 	old_mtu = ifp->if_mtu;
3772 	ifp->if_mtu = mtu;
3773 	if (ifp->if_flags & IFF_RUNNING) {
3774 		mxge_close(sc, 0);
3775 		err = mxge_open(sc);
3776 		if (err != 0) {
3777 			ifp->if_mtu = old_mtu;
3778 			mxge_close(sc, 0);
3779 			mxge_open(sc);
3780 		}
3781 	}
3782 	return err;
3783 }
3784 
3785 static void
3786 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3787 {
3788 	mxge_softc_t *sc = ifp->if_softc;
3789 
3790 	ifmr->ifm_status = IFM_AVALID;
3791 	ifmr->ifm_active = IFM_ETHER;
3792 
3793 	if (sc->link_state)
3794 		ifmr->ifm_status |= IFM_ACTIVE;
3795 
3796 	/*
3797 	 * Autoselect is not supported, so always report the
3798 	 * current media.
3799 	 */
3800 	ifmr->ifm_active |= sc->current_media;
3801 	if (sc->current_media != IFM_NONE) {
3802 		ifmr->ifm_active |= MXGE_IFM;
3803 		if (sc->pause)
3804 			ifmr->ifm_active |= IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
3805 	}
3806 }
3807 
3808 static int
3809 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3810     struct ucred *cr __unused)
3811 {
3812 	mxge_softc_t *sc = ifp->if_softc;
3813 	struct ifreq *ifr = (struct ifreq *)data;
3814 	int err, mask;
3815 
3816 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3817 	err = 0;
3818 
3819 	switch (command) {
3820 	case SIOCSIFMTU:
3821 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3822 		break;
3823 
3824 	case SIOCSIFFLAGS:
3825 		if (sc->dying)
3826 			return EINVAL;
3827 
3828 		if (ifp->if_flags & IFF_UP) {
3829 			if (!(ifp->if_flags & IFF_RUNNING)) {
3830 				err = mxge_open(sc);
3831 			} else {
3832 				/*
3833 				 * Take care of PROMISC and ALLMULTI
3834 				 * flag changes
3835 				 */
3836 				mxge_change_promisc(sc,
3837 				    ifp->if_flags & IFF_PROMISC);
3838 				mxge_set_multicast_list(sc);
3839 			}
3840 		} else {
3841 			if (ifp->if_flags & IFF_RUNNING)
3842 				mxge_close(sc, 0);
3843 		}
3844 		break;
3845 
3846 	case SIOCADDMULTI:
3847 	case SIOCDELMULTI:
3848 		mxge_set_multicast_list(sc);
3849 		break;
3850 
3851 	case SIOCSIFCAP:
3852 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3853 		if (mask & IFCAP_TXCSUM) {
3854 			ifp->if_capenable ^= IFCAP_TXCSUM;
3855 			if (ifp->if_capenable & IFCAP_TXCSUM)
3856 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3857 			else
3858 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3859 		}
3860 		if (mask & IFCAP_TSO) {
3861 			ifp->if_capenable ^= IFCAP_TSO;
3862 			if (ifp->if_capenable & IFCAP_TSO)
3863 				ifp->if_hwassist |= CSUM_TSO;
3864 			else
3865 				ifp->if_hwassist &= ~CSUM_TSO;
3866 		}
3867 		if (mask & IFCAP_RXCSUM)
3868 			ifp->if_capenable ^= IFCAP_RXCSUM;
3869 		if (mask & IFCAP_VLAN_HWTAGGING)
3870 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3871 		break;
3872 
3873 	case SIOCGIFMEDIA:
3874 	case SIOCSIFMEDIA:
3875 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3876 		    &sc->media, command);
3877 		break;
3878 
3879 	default:
3880 		err = ether_ioctl(ifp, command, data);
3881 		break;
3882 	}
3883 	return err;
3884 }
3885 
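/*
 * Snapshot the global tunables into the softc, clamping out-of-range
 * values.
 */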
3886 static void
3887 mxge_fetch_tunables(mxge_softc_t *sc)
3888 {
3889 	int ifm;
3890 
3891 	sc->intr_coal_delay = mxge_intr_coal_delay;
3892 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3893 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3894 
3895 	/* XXX */
3896 	if (mxge_ticks == 0)
3897 		mxge_ticks = hz / 2;
3898 
3899 	ifm = ifmedia_str2ethfc(mxge_flowctrl);
3900 	if (ifm & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE))
3901 		sc->pause = 1;
3902 
3903 	sc->use_rss = mxge_use_rss;
3904 
3905 	sc->throttle = mxge_throttle;
3906 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3907 		sc->throttle = MXGE_MAX_THROTTLE;
3908 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3909 		sc->throttle = MXGE_MIN_THROTTLE;
3910 }
3911 
3912 static void
3913 mxge_free_slices(mxge_softc_t *sc)
3914 {
3915 	struct mxge_slice_state *ss;
3916 	int i;
3917 
3918 	if (sc->ss == NULL)
3919 		return;
3920 
3921 	for (i = 0; i < sc->num_slices; i++) {
3922 		ss = &sc->ss[i];
3923 		if (ss->fw_stats != NULL) {
3924 			mxge_dma_free(&ss->fw_stats_dma);
3925 			ss->fw_stats = NULL;
3926 		}
3927 		if (ss->rx_data.rx_done.entry != NULL) {
3928 			mxge_dma_free(&ss->rx_done_dma);
3929 			ss->rx_data.rx_done.entry = NULL;
3930 		}
3931 	}
3932 	kfree(sc->ss, M_DEVBUF);
3933 	sc->ss = NULL;
3934 }
3935 
3936 static int
3937 mxge_alloc_slices(mxge_softc_t *sc)
3938 {
3939 	mxge_cmd_t cmd;
3940 	struct mxge_slice_state *ss;
3941 	size_t bytes;
3942 	int err, i, rx_ring_size;
3943 
3944 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3945 	if (err != 0) {
3946 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3947 		return err;
3948 	}
3949 	rx_ring_size = cmd.data0;
3950 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof (mcp_dma_addr_t));
3951 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof(mcp_dma_addr_t));
3952 	bytes = sizeof(*sc->ss) * sc->num_slices;
3953 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3954 
3955 	for (i = 0; i < sc->num_slices; i++) {
3956 		ss = &sc->ss[i];
3957 
3958 		ss->sc = sc;
3959 
3960 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3961 		lwkt_serialize_init(&ss->tx.tx_serialize);
3962 		ss->intr_rid = -1;
3963 
3964 		/*
3965 		 * Allocate per-slice rx interrupt queue
3966 		 * XXX assumes 4-byte mcp_slot
3967 		 */
3968 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3969 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3970 		if (err != 0) {
3971 			device_printf(sc->dev,
3972 			    "alloc %d slice rx_done failed\n", i);
3973 			return err;
3974 		}
3975 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3976 
3977 		/*
3978 		 * Allocate the per-slice firmware stats
3979 		 */
3980 		bytes = sizeof(*ss->fw_stats);
3981 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
3983 		if (err != 0) {
3984 			device_printf(sc->dev,
3985 			    "alloc %d fw_stats failed\n", i);
3986 			return err;
3987 		}
3988 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3989 	}
3990 	return 0;
3991 }
3992 
3993 static void
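/*
 * Decide how many slices (rx rings) to use.  Multiple slices require
 * more than one cpu, MSI-X and the slice aware (RSS) firmware; the
 * slice count is limited by the MSI-X vector count and the firmware's
 * RSS queue limit, each rounded down to a power of 2.
 */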
3994 mxge_slice_probe(mxge_softc_t *sc)
3995 {
3996 	int status, max_intr_slots, max_slices, num_slices;
3997 	int msix_cnt, msix_enable, i, multi_tx;
3998 	mxge_cmd_t cmd;
3999 	const char *old_fw;
4000 
4001 	sc->num_slices = 1;
4002 	sc->num_tx_rings = 1;
4003 
4004 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
4005 	if (num_slices == 1)
4006 		return;
4007 
4008 	if (ncpus2 == 1)
4009 		return;
4010 
4011 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
4012 	    mxge_msix_enable);
4013 	if (!msix_enable)
4014 		return;
4015 
4016 	msix_cnt = pci_msix_count(sc->dev);
4017 	if (msix_cnt < 2)
4018 		return;
4019 
4020 	/*
4021 	 * Round down MSI-X vector count to the nearest power of 2
4022 	 */
4023 	i = 0;
4024 	while ((1 << (i + 1)) <= msix_cnt)
4025 		++i;
4026 	msix_cnt = 1 << i;
4027 
4028 	/*
4029 	 * Now load the slice aware firmware and see what it supports
4030 	 */
4031 	old_fw = sc->fw_name;
4032 	if (old_fw == mxge_fw_aligned)
4033 		sc->fw_name = mxge_fw_rss_aligned;
4034 	else
4035 		sc->fw_name = mxge_fw_rss_unaligned;
4036 	status = mxge_load_firmware(sc, 0);
4037 	if (status != 0) {
4038 		device_printf(sc->dev, "Falling back to a single slice\n");
4039 		return;
4040 	}
4041 
4042 	/*
4043 	 * Try to send a reset command to the card to see if it is alive
4044 	 */
4045 	memset(&cmd, 0, sizeof(cmd));
4046 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4047 	if (status != 0) {
4048 		device_printf(sc->dev, "failed reset\n");
4049 		goto abort_with_fw;
4050 	}
4051 
4052 	/*
4053 	 * Get rx ring size to calculate rx interrupt queue size
4054 	 */
4055 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4056 	if (status != 0) {
4057 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4058 		goto abort_with_fw;
4059 	}
4060 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4061 
4062 	/*
4063 	 * Tell it the size of the rx interrupt queue
4064 	 */
4065 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4066 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4067 	if (status != 0) {
4068 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4069 		goto abort_with_fw;
4070 	}
4071 
4072 	/*
4073 	 * Ask for the maximum number of slices it supports
4074 	 */
4075 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4076 	if (status != 0) {
4077 		device_printf(sc->dev,
4078 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4079 		goto abort_with_fw;
4080 	}
4081 	max_slices = cmd.data0;
4082 
4083 	/*
4084 	 * Round down max slices count to the nearest power of 2
4085 	 */
4086 	i = 0;
4087 	while ((1 << (i + 1)) <= max_slices)
4088 		++i;
4089 	max_slices = 1 << i;
4090 
4091 	if (max_slices > msix_cnt)
4092 		max_slices = msix_cnt;
4093 
4094 	sc->num_slices = if_ring_count2(num_slices, max_slices);
4096 
4097 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4098 	if (multi_tx)
4099 		sc->num_tx_rings = sc->num_slices;
4100 
4101 	if (bootverbose) {
4102 		device_printf(sc->dev, "using %d slices, max %d\n",
4103 		    sc->num_slices, max_slices);
4104 	}
4105 
4106 	if (sc->num_slices == 1)
4107 		goto abort_with_fw;
4108 	return;
4109 
4110 abort_with_fw:
4111 	sc->fw_name = old_fw;
4112 	mxge_load_firmware(sc, 0);
4113 }
4114 
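/*
 * Collect the main, rx and tx serializers into one array for the
 * ifnet serialize methods below.
 */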
4115 static void
4116 mxge_setup_serialize(struct mxge_softc *sc)
4117 {
4118 	int i = 0, slice;
4119 
4120 	/* Main + rx + tx */
4121 	sc->nserialize = (2 * sc->num_slices) + 1;
4122 	sc->serializes =
4123 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4124 	        M_DEVBUF, M_WAITOK | M_ZERO);
4125 
4126 	/*
4127 	 * Setup serializes
4128 	 *
4129 	 * NOTE: Order is critical
4130 	 */
4131 
4132 	KKASSERT(i < sc->nserialize);
4133 	sc->serializes[i++] = &sc->main_serialize;
4134 
4135 	for (slice = 0; slice < sc->num_slices; ++slice) {
4136 		KKASSERT(i < sc->nserialize);
4137 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4138 	}
4139 
4140 	for (slice = 0; slice < sc->num_slices; ++slice) {
4141 		KKASSERT(i < sc->nserialize);
4142 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4143 	}
4144 
4145 	KKASSERT(i == sc->nserialize);
4146 }
4147 
4148 static void
4149 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4150 {
4151 	struct mxge_softc *sc = ifp->if_softc;
4152 
4153 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4154 }
4155 
4156 static void
4157 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4158 {
4159 	struct mxge_softc *sc = ifp->if_softc;
4160 
4161 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4162 }
4163 
4164 static int
4165 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4166 {
4167 	struct mxge_softc *sc = ifp->if_softc;
4168 
4169 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4170 }
4171 
4172 #ifdef INVARIANTS
4173 
4174 static void
4175 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4176     boolean_t serialized)
4177 {
4178 	struct mxge_softc *sc = ifp->if_softc;
4179 
4180 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4181 	    slz, serialized);
4182 }
4183 
4184 #endif	/* INVARIANTS */
4185 
4186 #ifdef IFPOLL_ENABLE
4187 
4188 static void
4189 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4190 {
4191 	struct mxge_slice_state *ss = xss;
4192 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4193 
4194 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4195 
4196 	if (rx_done->entry[rx_done->idx].length != 0) {
4197 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4198 	} else {
4199 		/*
4200 		 * XXX
4201 		 * This register write obviously has a cost;
4202 		 * however, if we don't hand back the rx token,
4203 		 * upcoming packets may suffer a ridiculously
4204 		 * large delay, as observed on 8AL-C using ping(8).
4205 		 */
4206 		*ss->irq_claim = be32toh(3);
4207 	}
4208 }
4209 
4210 static void
4211 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4212 {
4213 	struct mxge_softc *sc = ifp->if_softc;
4214 	int i;
4215 
4216 	if (info == NULL)
4217 		return;
4218 
4219 	/*
4220 	 * Only poll rx; polling tx and status doesn't seem to work
4221 	 */
4222 	for (i = 0; i < sc->num_slices; ++i) {
4223 		struct mxge_slice_state *ss = &sc->ss[i];
4224 		int idx = ss->intr_cpuid;
4225 
4226 		KKASSERT(idx < ncpus2);
4227 		info->ifpi_rx[idx].poll_func = mxge_npoll_rx;
4228 		info->ifpi_rx[idx].arg = ss;
4229 		info->ifpi_rx[idx].serializer = &ss->rx_data.rx_serialize;
4230 	}
4231 }
4232 
4233 #endif	/* IFPOLL_ENABLE */
4234 
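/*
 * Attach: map the board, parse the EEPROM strings, select and load
 * the firmware, size the slices, then allocate DMA memory, rings and
 * interrupts before attaching the ethernet interface.
 */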
4235 static int
4236 mxge_attach(device_t dev)
4237 {
4238 	mxge_softc_t *sc = device_get_softc(dev);
4239 	struct ifnet *ifp = &sc->arpcom.ac_if;
4240 	int err, rid, i;
4241 
4242 	/*
4243 	 * Avoid rewriting half the lines in this file to use
4244 	 * &sc->arpcom.ac_if instead
4245 	 */
4246 	sc->ifp = ifp;
4247 	sc->dev = dev;
4248 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4249 
4250 	/* IFM_ETH_FORCEPAUSE can't be changed */
4251 	ifmedia_init(&sc->media, IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
4252 	    mxge_media_change, mxge_media_status);
4253 
4254 	lwkt_serialize_init(&sc->main_serialize);
4255 
4256 	mxge_fetch_tunables(sc);
4257 
4258 	err = bus_dma_tag_create(NULL,			/* parent */
4259 				 1,			/* alignment */
4260 				 0,			/* boundary */
4261 				 BUS_SPACE_MAXADDR,	/* low */
4262 				 BUS_SPACE_MAXADDR,	/* high */
4263 				 NULL, NULL,		/* filter */
4264 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4265 				 0, 			/* num segs */
4266 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4267 				 0,			/* flags */
4268 				 &sc->parent_dmat);	/* tag */
4269 	if (err != 0) {
4270 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4271 		goto failed;
4272 	}
4273 
4274 	callout_init_mp(&sc->co_hdl);
4275 
4276 	mxge_setup_cfg_space(sc);
4277 
4278 	/*
4279 	 * Map the board into the kernel
4280 	 */
4281 	rid = PCIR_BARS;
4282 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4283 	    &rid, RF_ACTIVE);
4284 	if (sc->mem_res == NULL) {
4285 		device_printf(dev, "could not map memory\n");
4286 		err = ENXIO;
4287 		goto failed;
4288 	}
4289 
4290 	sc->sram = rman_get_virtual(sc->mem_res);
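	/* XXX magic: 2MB of lanai SRAM less firmware-reserved regions */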
4291 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4292 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4293 		device_printf(dev, "impossible memory region size %ld\n",
4294 		    rman_get_size(sc->mem_res));
4295 		err = ENXIO;
4296 		goto failed;
4297 	}
4298 
4299 	/*
4300 	 * Make NULL terminated copy of the EEPROM strings section of
4301 	 * lanai SRAM
4302 	 */
4303 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4304 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4305 	    rman_get_bushandle(sc->mem_res),
4306 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4307 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4308 	err = mxge_parse_strings(sc);
4309 	if (err != 0) {
4310 		device_printf(dev, "parse EEPROM string failed\n");
4311 		goto failed;
4312 	}
4313 
4314 	/*
4315 	 * Enable write combining for efficient use of PCIe bus
4316 	 */
4317 	mxge_enable_wc(sc);
4318 
4319 	/*
4320 	 * Allocate the out of band DMA memory
4321 	 */
4322 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4323 	if (err != 0) {
4324 		device_printf(dev, "alloc cmd DMA buf failed\n");
4325 		goto failed;
4326 	}
4327 	sc->cmd = sc->cmd_dma.dmem_addr;
4328 
4329 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4330 	if (err != 0) {
4331 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4332 		goto failed;
4333 	}
4334 
4335 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4336 	if (err != 0) {
4337 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4338 		goto failed;
4339 	}
4340 
4341 	/* Select & load the firmware */
4342 	err = mxge_select_firmware(sc);
4343 	if (err != 0) {
4344 		device_printf(dev, "select firmware failed\n");
4345 		goto failed;
4346 	}
4347 
4348 	mxge_slice_probe(sc);
4349 	err = mxge_alloc_slices(sc);
4350 	if (err != 0) {
4351 		device_printf(dev, "alloc slices failed\n");
4352 		goto failed;
4353 	}
4354 
4355 	err = mxge_alloc_intr(sc);
4356 	if (err != 0) {
4357 		device_printf(dev, "alloc intr failed\n");
4358 		goto failed;
4359 	}
4360 
4361 	/* Setup serializes */
4362 	mxge_setup_serialize(sc);
4363 
4364 	err = mxge_reset(sc, 0);
4365 	if (err != 0) {
4366 		device_printf(dev, "reset failed\n");
4367 		goto failed;
4368 	}
4369 
4370 	err = mxge_alloc_rings(sc);
4371 	if (err != 0) {
4372 		device_printf(dev, "failed to allocate rings\n");
4373 		goto failed;
4374 	}
4375 
4376 	ifp->if_baudrate = IF_Gbps(10UL);
4377 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4378 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4379 
4380 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4381 #if 0
4382 	/* Well, it's software, sigh */
4383 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4384 #endif
4385 	ifp->if_capenable = ifp->if_capabilities;
4386 
4387 	ifp->if_softc = sc;
4388 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4389 	ifp->if_init = mxge_init;
4390 	ifp->if_ioctl = mxge_ioctl;
4391 	ifp->if_start = mxge_start;
4392 #ifdef IFPOLL_ENABLE
4393 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4394 		ifp->if_npoll = mxge_npoll;
4395 #endif
4396 	ifp->if_serialize = mxge_serialize;
4397 	ifp->if_deserialize = mxge_deserialize;
4398 	ifp->if_tryserialize = mxge_tryserialize;
4399 #ifdef INVARIANTS
4400 	ifp->if_serialize_assert = mxge_serialize_assert;
4401 #endif
4402 
4403 	/* Increase TSO burst length */
4404 	ifp->if_tsolen = (32 * ETHERMTU);
4405 
4406 	/* Initialize the ifmedia structure */
4407 	mxge_media_init(sc);
4408 	mxge_media_probe(sc);
4409 
4410 	ether_ifattach(ifp, sc->mac_addr, NULL);
4411 
4412 	/* Setup TX rings and subqueues */
4413 	for (i = 0; i < sc->num_tx_rings; ++i) {
4414 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4415 		struct mxge_slice_state *ss = &sc->ss[i];
4416 
4417 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4418 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4419 		ifsq_set_priv(ifsq, &ss->tx);
4420 		ss->tx.ifsq = ifsq;
4421 
4422 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
4423 	}
4424 
4425 	/*
4426 	 * XXX
4427 	 * We are not ready to do "gather" jumbo frames, so
4428 	 * limit MTU to MJUMPAGESIZE
4429 	 */
4430 	sc->max_mtu = MJUMPAGESIZE -
4431 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4432 	sc->dying = 0;
4433 
4434 	err = mxge_setup_intr(sc);
4435 	if (err != 0) {
4436 		device_printf(dev, "alloc and setup intr failed\n");
4437 		ether_ifdetach(ifp);
4438 		goto failed;
4439 	}
4440 
4441 	mxge_add_sysctls(sc);
4442 
4443 	/* Increase non-cluster mbuf limit; used by small RX rings */
4444 	mb_inclimit(ifp->if_nmbclusters);
4445 
4446 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4447 	    sc->ss[0].intr_cpuid);
4448 	return 0;
4449 
4450 failed:
4451 	mxge_detach(dev);
4452 	return err;
4453 }
4454 
4455 static int
4456 mxge_detach(device_t dev)
4457 {
4458 	mxge_softc_t *sc = device_get_softc(dev);
4459 
4460 	if (device_is_attached(dev)) {
4461 		struct ifnet *ifp = sc->ifp;
4462 		int mblimit = ifp->if_nmbclusters;
4463 
4464 		ifnet_serialize_all(ifp);
4465 
4466 		sc->dying = 1;
4467 		if (ifp->if_flags & IFF_RUNNING)
4468 			mxge_close(sc, 1);
4469 		callout_stop(&sc->co_hdl);
4470 
4471 		mxge_teardown_intr(sc, sc->num_slices);
4472 
4473 		ifnet_deserialize_all(ifp);
4474 
4475 		callout_terminate(&sc->co_hdl);
4476 
4477 		ether_ifdetach(ifp);
4478 
4479 		/* Decrease non-cluster mbuf limit increased by us */
4480 		mb_inclimit(-mblimit);
4481 	}
4482 	ifmedia_removeall(&sc->media);
4483 
4484 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4485 	    sc->sram != NULL)
4486 		mxge_dummy_rdma(sc, 0);
4487 
4488 	mxge_free_intr(sc);
4489 	mxge_rem_sysctls(sc);
4490 	mxge_free_rings(sc);
4491 
4492 	/* MUST after sysctls, intr and rings are freed */
4493 	mxge_free_slices(sc);
4494 
4495 	if (sc->dmabench_dma.dmem_addr != NULL)
4496 		mxge_dma_free(&sc->dmabench_dma);
4497 	if (sc->zeropad_dma.dmem_addr != NULL)
4498 		mxge_dma_free(&sc->zeropad_dma);
4499 	if (sc->cmd_dma.dmem_addr != NULL)
4500 		mxge_dma_free(&sc->cmd_dma);
4501 
4502 	if (sc->msix_table_res != NULL) {
4503 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4504 		    sc->msix_table_res);
4505 	}
4506 	if (sc->mem_res != NULL) {
4507 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4508 		    sc->mem_res);
4509 	}
4510 
4511 	if (sc->parent_dmat != NULL)
4512 		bus_dma_tag_destroy(sc->parent_dmat);
4513 
4514 	return 0;
4515 }
4516 
4517 static int
4518 mxge_shutdown(device_t dev)
4519 {
4520 	return 0;
4521 }
4522 
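/*
 * Release the per-slice MSI-X resources; 'setup' tells whether
 * pci_setup_msix() succeeded and must be torn down as well.
 */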
4523 static void
4524 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4525 {
4526 	int i;
4527 
4528 	KKASSERT(sc->num_slices > 1);
4529 
4530 	for (i = 0; i < sc->num_slices; ++i) {
4531 		struct mxge_slice_state *ss = &sc->ss[i];
4532 
4533 		if (ss->intr_res != NULL) {
4534 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4535 			    ss->intr_rid, ss->intr_res);
4536 		}
4537 		if (ss->intr_rid >= 0)
4538 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4539 	}
4540 	if (setup)
4541 		pci_teardown_msix(sc->dev);
4542 }
4543 
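/*
 * Allocate one MSI-X vector per slice.  Slice 0 takes the combined
 * interrupt ("comb"); the other slices take rx-only ("rx") or rx+tx
 * ("rxtx") interrupts, depending on the number of tx rings.  Target
 * cpus are assigned consecutively, starting at a tunable offset.
 */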
4544 static int
4545 mxge_alloc_msix(struct mxge_softc *sc)
4546 {
4547 	struct mxge_slice_state *ss;
4548 	int offset, rid, error, i;
4549 	boolean_t setup = FALSE;
4550 
4551 	KKASSERT(sc->num_slices > 1);
4552 
4553 	if (sc->num_slices == ncpus2) {
4554 		offset = 0;
4555 	} else {
4556 		int offset_def;
4557 
4558 		offset_def = (sc->num_slices * device_get_unit(sc->dev)) %
4559 		    ncpus2;
4560 
4561 		offset = device_getenv_int(sc->dev, "msix.offset", offset_def);
4562 		if (offset >= ncpus2 ||
4563 		    offset % sc->num_slices != 0) {
4564 			device_printf(sc->dev, "invalid msix.offset %d, "
4565 			    "use %d\n", offset, offset_def);
4566 			offset = offset_def;
4567 		}
4568 	}
4569 
4570 	ss = &sc->ss[0];
4571 
4572 	ss->intr_serialize = &sc->main_serialize;
4573 	ss->intr_func = mxge_msi;
4574 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4575 	    "%s comb", device_get_nameunit(sc->dev));
4576 	ss->intr_desc = ss->intr_desc0;
4577 	ss->intr_cpuid = offset;
4578 
4579 	for (i = 1; i < sc->num_slices; ++i) {
4580 		ss = &sc->ss[i];
4581 
4582 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4583 		if (sc->num_tx_rings == 1) {
4584 			ss->intr_func = mxge_msix_rx;
4585 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4586 			    "%s rx", device_get_nameunit(sc->dev));
4587 		} else {
4588 			ss->intr_func = mxge_msix_rxtx;
4589 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4590 			    "%s rxtx", device_get_nameunit(sc->dev));
4591 		}
4592 		ss->intr_desc = ss->intr_desc0;
4593 		ss->intr_cpuid = offset + i;
4594 	}
4595 
4596 	rid = PCIR_BAR(2);
4597 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4598 	    &rid, RF_ACTIVE);
4599 	if (sc->msix_table_res == NULL) {
4600 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4601 		return ENXIO;
4602 	}
4603 
4604 	error = pci_setup_msix(sc->dev);
4605 	if (error) {
4606 		device_printf(sc->dev, "could not setup MSI-X\n");
4607 		goto back;
4608 	}
4609 	setup = TRUE;
4610 
4611 	for (i = 0; i < sc->num_slices; ++i) {
4612 		ss = &sc->ss[i];
4613 
4614 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4615 		    ss->intr_cpuid);
4616 		if (error) {
4617 			device_printf(sc->dev, "could not alloc "
4618 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4619 			goto back;
4620 		}
4621 
4622 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4623 		    &ss->intr_rid, RF_ACTIVE);
4624 		if (ss->intr_res == NULL) {
4625 			device_printf(sc->dev, "could not alloc "
4626 			    "MSI-X %d resource\n", i);
4627 			error = ENXIO;
4628 			goto back;
4629 		}
4630 	}
4631 
4632 	pci_enable_msix(sc->dev);
4633 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4634 back:
4635 	if (error)
4636 		mxge_free_msix(sc, setup);
4637 	return error;
4638 }
4639 
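/*
 * Multiple slices require MSI-X; a single slice uses one MSI or
 * legacy interrupt.
 */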
4640 static int
4641 mxge_alloc_intr(struct mxge_softc *sc)
4642 {
4643 	struct mxge_slice_state *ss;
4644 	u_int irq_flags;
4645 
4646 	if (sc->num_slices > 1) {
4647 		int error;
4648 
4649 		error = mxge_alloc_msix(sc);
4650 		if (error)
4651 			return error;
4652 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4653 		return 0;
4654 	}
4655 
4656 	ss = &sc->ss[0];
4657 
4658 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4659 	    &ss->intr_rid, &irq_flags);
4660 
4661 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4662 	    &ss->intr_rid, irq_flags);
4663 	if (ss->intr_res == NULL) {
4664 		device_printf(sc->dev, "could not alloc interrupt\n");
4665 		return ENXIO;
4666 	}
4667 
4668 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4669 		ss->intr_func = mxge_legacy;
4670 	else
4671 		ss->intr_func = mxge_msi;
4672 	ss->intr_serialize = &sc->main_serialize;
4673 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4674 
4675 	return 0;
4676 }
4677 
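/*
 * Install the interrupt handlers for all slices; on failure tear
 * down the handlers that were already installed.
 */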
4678 static int
4679 mxge_setup_intr(struct mxge_softc *sc)
4680 {
4681 	int i;
4682 
4683 	for (i = 0; i < sc->num_slices; ++i) {
4684 		struct mxge_slice_state *ss = &sc->ss[i];
4685 		int error;
4686 
4687 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4688 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4689 		    ss->intr_serialize, ss->intr_desc);
4690 		if (error) {
4691 			device_printf(sc->dev, "can't setup intr %d\n", i);
4692 			mxge_teardown_intr(sc, i);
4693 			return error;
4694 		}
4695 	}
4696 	return 0;
4697 }
4698 
4699 static void
4700 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4701 {
4702 	int i;
4703 
4704 	if (sc->ss == NULL)
4705 		return;
4706 
4707 	for (i = 0; i < cnt; ++i) {
4708 		struct mxge_slice_state *ss = &sc->ss[i];
4709 
4710 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4711 	}
4712 }
4713 
4714 static void
4715 mxge_free_intr(struct mxge_softc *sc)
4716 {
4717 	if (sc->ss == NULL)
4718 		return;
4719 
4720 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4721 		struct mxge_slice_state *ss = &sc->ss[0];
4722 
4723 		if (ss->intr_res != NULL) {
4724 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4725 			    ss->intr_rid, ss->intr_res);
4726 		}
4727 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4728 			pci_release_msi(sc->dev);
4729 	} else {
4730 		mxge_free_msix(sc, TRUE);
4731 	}
4732 }
4733