xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 7d84b73d)
/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $

***************************************************************************/

#include "opt_ifpoll.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/if_ringmap.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_poll.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>
#include <net/toeplitz.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__x86_64__)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
#include <dev/netif/mxge/if_mxge_var.h>

#define MXGE_IFM	(IFM_ETHER | IFM_FDX | IFM_ETH_FORCEPAUSE)

#define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
#define MXGE_HWRSS_KEYLEN		16

/* Tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
static int mxge_deassert_wait = 1;
static int mxge_ticks;
static int mxge_num_slices = 0;
static int mxge_always_promisc = 0;
static int mxge_throttle = 0;
static int mxge_msi_enable = 1;
static int mxge_msix_enable = 1;
static int mxge_multi_tx = 1;
/*
 * Don't use RSS by default, it's just too slow
 */
static int mxge_use_rss = 0;

static char mxge_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_FORCE_NONE;

static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
static const char *mxge_fw_aligned = "mxge_eth_z8e";
static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
TUNABLE_STR("hw.mxge.flow_ctrl", mxge_flowctrl, sizeof(mxge_flowctrl));

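/*
 * The tunables above are read from the kernel environment at boot;
 * they can be set from /boot/loader.conf, e.g. (illustrative values):
 *
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.msix.enable="0"
 */
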
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);

static int mxge_alloc_intr(struct mxge_softc *sc);
static void mxge_free_intr(struct mxge_softc *sc);
static int mxge_setup_intr(struct mxge_softc *sc);
static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);

static device_method_t mxge_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	DEVMETHOD_END
};

static driver_t mxge_driver = {
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static void mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
static void mxge_watchdog_reset(mxge_softc_t *sc);
static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);

static int
mxge_probe(device_t dev)
{
	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
		int rev = pci_get_revid(dev);

		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__x86_64__)
	vm_offset_t len;

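	/*
	 * Map the NIC's SRAM aperture write-combining so that the
	 * PIO copies of send requests (and of the firmware image)
	 * are batched into larger bus writes.
	 */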
	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
	    PAT_WRITE_COMBINING);
#endif
}

static int
mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
    bus_size_t alignment)
{
	bus_size_t boundary;
	int err;

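	/*
	 * A 4KB-aligned allocation larger than 4KB necessarily crosses
	 * a 4KB boundary, so the boundary restriction is dropped in
	 * that case; smaller buffers are kept within a 4KB boundary.
	 */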
	if (bytes > 4096 && alignment == 4096)
		boundary = 0;
	else
		boundary = 4096;

	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
		return err;
	}
	return 0;
}

static void
mxge_dma_free(bus_dmamem_t *dma)
{
	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
	bus_dma_tag_destroy(dma->dmem_tag);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
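/*
 * For example (illustrative values only), a complete string set
 * could look like:
 *	SN=458532\0MAC=00:60:dd:47:87:2e\0PC=10G-PCIE-8A-C\0\0
 */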
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	const char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");
	return ENXIO;
}

#if defined(__x86_64__)

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/*
	 * XXXX
	 * Test below is commented because it is believed that doing
	 * config read/write beyond 0xff will access the config space
	 * for the next larger function.  Uncomment this and remove
	 * the hacky pmap_mapdev() way of accessing config space when
	 * DragonFly grows support for extended pcie config space access.
	 */
#if 0
	/*
	 * See if we can, by some miracle, access the extended
	 * config space
	 */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/*
	 * Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
	 * handled by the Nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them, are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off = base + 0x00100000UL * (unsigned long)bus +
	    0x00001000UL * (unsigned long)(func + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (!(vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
		    vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t *)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (bootverbose) {
		device_printf(sc->dev, "Enabled ECRC on upstream "
		    "Nvidia bridge at %d:%d:%d\n",
		    (int)bus, (int)slot, (int)func);
	}
}

#else	/* __x86_64__ */

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
}

#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
	int status;
	uint32_t len;
	const char *test = " ";

	/*
	 * Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
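
	/*
	 * Worked example (illustrative numbers): with len = 4096, a
	 * read test returning cmd.data0 = (183 << 16) | 1000 means
	 * 183 transfers completed in 1000 ticks (500us), i.e.
	 * (183 * 4096 * 2) / 1000 ~= 1499 MB/s, which is exactly the
	 * sc->read_dma computation below.
	 */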

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0 >> 16) * len * 2 * 2) /
	    (cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
		    test, status);
	}
	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;

	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
			    pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * Load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0)
		return status;

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS) {
		device_printf(dev, "Falling back to ethp! "
		    "Please install up-to-date firmware\n");
	}
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (bootverbose) {
			device_printf(sc->dev,
			    "Assuming %s completions (forced)\n",
			    aligned ? "aligned" : "unaligned");
		}
		goto abort;
	}

	/*
	 * If the PCIe link width is 4 or less, we can use the aligned
	 * firmware and skip any checks
	 */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev, "PCIe x%d Link, "
		    "expect reduced performance\n", sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (mxge_firmware_probe(sc) == 0)
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return mxge_load_firmware(sc, 0);
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
		    be32toh(hdr->mcp_type));
		return EIO;
	}

	/* Save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (bootverbose)
		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	    &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		if_printf(sc->ifp, "Found firmware version %s\n",
		    sc->fw_version);
		if_printf(sc->ifp, "Driver needs %d.%d\n",
		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

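/*
 * Minimal kmalloc(9)-backed allocator callbacks handed to zlib
 * through z_stream.zalloc/zfree below; the opaque first argument
 * is unused.
 */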
static void *
z_alloc(void *nil, u_int items, u_int size)
{
	return kmalloc(items * size, M_TEMP, M_WAITOK);
}

static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		if_printf(sc->ifp, "Could not find firmware image %s\n",
		    sc->fw_name);
		return ENOENT;
	}

	/* Setup zlib and decompress f/w */
	bzero(&zs, sizeof(zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/*
	 * The uncompressed size is stored as the firmware version,
	 * which would otherwise go unused
	 */
	fw_len = (size_t)fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		if_printf(sc->ifp, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* Check id */
	hdr_offset =
	    htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		if_printf(sc->ifp, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void *)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
		    min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

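	/*
	 * (x + 7) & ~7 rounds x up to the next multiple of 8, giving
	 * an 8-byte aligned command block within buf_bytes.
	 */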
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send an rdma command to the PCIe engine, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
		    (enable ? "enable" : "disable"), confirm, *confirm);
	}
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* Ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/*
	 * Wait up to 20ms
	 */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			if_printf(sc->ifp, "command %d failed, result = %d\n",
			    cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN) {
		if_printf(sc->ifp, "command %d timed out result = %d\n",
		    cmd, be32toh(response->result));
	}
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof(struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/*
	 * Find running firmware header
	 */
	hdr_offset =
	    htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		if_printf(sc->ifp, "Running firmware has bad header offset "
		    "(%zu)\n", hdr_offset);
		return EIO;
	}

	/*
	 * Copy header of running firmware from SRAM to host memory to
	 * validate firmware
	 */
	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * Check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
		    "working around rx filter bug\n",
		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;

		/*
		 * Try to use the currently running firmware, if
		 * it is new enough
		 */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			if_printf(sc->ifp,
			    "failed to adopt running firmware\n");
			return status;
		}
		if_printf(sc->ifp, "Successfully adopted running firmware\n");

		if (sc->tx_boundary == 4096) {
			if_printf(sc->ifp,
			     "Using firmware currently running on NIC.  "
			     "For optimal\n");
			if_printf(sc->ifp, "performance consider loading "
			     "optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send a reload command to the bootstrap MCP, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/*
	 * FIX: All newest firmware should un-protect the bottom of
	 * the sram before handoff. However, the very first interfaces
	 * do not. Therefore the handoff copy must skip the first 8 bytes
	 */
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
		    confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;

	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
	    (addr[2] << 8) | addr[3];
	cmd.data1 = (addr[4] << 8) | (addr[5]);
	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
	if (status) {
		if_printf(sc->ifp, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* avoid gcc warning */
	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
	if (status)
		if_printf(sc->ifp, "Failed to set promisc mode\n");
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
		    "error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI) {
		/* Request to disable multicast filtering, so quit here */
		return;
	}

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
		    "error status: %d\n", err);
		return;
	}

	/*
	 * Walk the multicast list, and add each address
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;

		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		    &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		    &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
			    "error status: %d\n", err);
			/* Abort, leaving multicast filtering off */
			return;
		}
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
		    "error status: %d\n", err);
	}
}

#if 0
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/*
	 * Try to set nbufs to see if we can
	 * use virtually contiguous jumbos
	 */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
#endif

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status, rx_intr_size;

	/*
	 * Try to send a reset command to the card to see if it
	 * is alive
	 */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		if_printf(sc->ifp, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/*
	 * Set the intrq size
	 * XXX assume 4byte mcp_slot
	 */
	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
	cmd.data0 = rx_intr_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
	if (sc->num_slices > 1) {
		/* Ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to get number of slices\n");
			return status;
		}

		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		if (sc->num_tx_rings > 1)
			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];

			rx_done = &ss->rx_data.rx_done;
			memset(rx_done->entry, 0, rx_intr_size);

			cmd.data0 =
			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data1 =
			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
			    &cmd);
		}
	}

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
	    &cmd);
	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);

	if (status != 0) {
		if_printf(sc->ifp, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* Run a DMA benchmark */
	mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);

		/* Reset mcp/driver shared state back to 0 */
		ss->rx_data.rx_done.idx = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->rx_data.rx_big.cnt = 0;
		ss->rx_data.rx_small.cnt = 0;
		if (ss->fw_stats != NULL)
			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
	}
	sc->rdma_tags_available = 15;

	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);

	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
			if_printf(sc->ifp, "can't enable throttle\n");
	}
	return status;
}

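/*
 * Sysctl handler for the per-device "throttle" node registered in
 * mxge_add_sysctls(); lets the transmit throttle factor be changed
 * at runtime.
 */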
static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0)
		return err;

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	int err, use_rss;

	sc = arg1;
	use_rss = sc->use_rss;
	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
	if (err != 0)
		return err;

	if (use_rss == sc->use_rss)
		return 0;

	ifnet_serialize_all(sc->ifp);

	sc->use_rss = use_rss;
	if (sc->ifp->if_flags & IFF_RUNNING) {
		mxge_close(sc, 0);
		mxge_open(sc);
	}

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0)
		return err;

	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
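
	/*
	 * Convert the firmware's big-endian counter to host order and
	 * report it by value: with arg1 == NULL, sysctl_handle_int()
	 * exports arg2 as a read-only integer.
	 */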
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	if (sc->ss != NULL) {
		struct mxge_slice_state *ss;
		int slice;

		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];
			if (ss->sysctl_tree != NULL) {
				sysctl_ctx_free(&ss->sysctl_ctx);
				ss->sysctl_tree = NULL;
			}
		}
	}

	if (sc->slice_sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->slice_sysctl_ctx);
		sc->slice_sysctl_tree = NULL;
	}
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/*
	 * Random information
	 */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
	    CTLFLAG_RD, &sc->link_width, 0, "link width");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
	    CTLFLAG_RD, &sc->read_write_dma, 0,
	    "DMA concurrent Read/Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
	    CTLFLAG_RD, &sc->watchdog_resets, 0,
	    "Number of times NIC was reset");

	if (sc->num_slices > 1) {
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "slice_cpumap",
		    CTLTYPE_OPAQUE | CTLFLAG_RD, sc->ring_map, 0,
		    if_ringmap_cpumap_sysctl, "I", "slice CPU map");
	}

	/*
	 * Performance related tunables
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
	    "Interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
	    "Transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
	    "Use RSS");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
	    CTLFLAG_RW, &mxge_deassert_wait, 0,
	    "Wait for IRQ line to go low in ihandler");

	/*
	 * Stats block from firmware is in network byte order.
	 * Need to swap it
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
	    mxge_handle_be32, "I", "link up");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
	    mxge_handle_be32, "I", "rdma_tags_available");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
	    mxge_handle_be32, "I", "dropped_bad_crc32");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
	    mxge_handle_be32, "I", "dropped_bad_phy");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
	    mxge_handle_be32, "I", "dropped_link_overflow");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_multicast_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_big_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_small_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
	    mxge_handle_be32, "I", "dropped_overrun");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
	    mxge_handle_be32, "I", "dropped_pause");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
	    mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_unicast_filtered");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
	if (sc->slice_sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add slice sysctl node\n");
		return;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
		    slice_num, CTLFLAG_RD, 0, "");
		if (ss->sysctl_tree == NULL) {
			device_printf(sc->dev,
			    "can't add %d slice sysctl node\n", slice);
			return;	/* XXX continue? */
		}
		children = SYSCTL_CHILDREN(ss->sysctl_tree);

		/*
		 * XXX change to ULONG
		 */

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * backwards one at a time and handle ring wraps
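 * (copying last-to-first means no slot is written before the slots
 * that follow it in the chain are already in place)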
 */
static __inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
static __inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < cnt - 1; i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * Submit all but the first request, and ensure
		 * that it is submitted below
		 */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* Submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* Re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_pullup_tso(struct mbuf **mp)
{
	int hoff, iphlen, thoff;
	struct mbuf *m;

	m = *mp;
	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));

	iphlen = m->m_pkthdr.csum_iphlen;
	thoff = m->m_pkthdr.csum_thlen;
	hoff = m->m_pkthdr.csum_lhlen;

	KASSERT(iphlen > 0, ("invalid ip hlen"));
	KASSERT(thoff > 0, ("invalid tcp hlen"));
	KASSERT(hoff > 0, ("invalid ether hlen"));

	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
		m = m_pullup(m, hoff + iphlen + thoff);
		if (m == NULL) {
			*mp = NULL;
			return ENOBUFS;
		}
		*mp = m;
	}
	return 0;
}

static int
mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
    struct mbuf *m, int busdma_seg_cnt)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	struct mxge_buffer_state *info_last;
	bus_dmamap_t map = info_map->map;

	mss = m->m_pkthdr.tso_segsz;

	/*
	 * Negative cum_len signifies to the send loop that we are
	 * still in the header portion of the TSO packet.
	 */
	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
	    m->m_pkthdr.csum_thlen);

	/*
	 * TSO implies checksum offload on this hardware
	 */
	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/*
	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
	 * out where to put the checksum by parsing the header.
	 */
	pseudo_hdr_offset = htobe16(mss);

	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;

	/*
	 * "rdma_count" is the number of RDMAs belonging to the current
	 * packet BEFORE the current send request.  For non-TSO packets,
	 * this is equal to "count".
	 *
	 * For TSO packets, rdma_count needs to be reset to 0 after a
	 * segment cut.
	 *
	 * The rdma_count field of the send request is the number of
	 * RDMAs of the packet starting at that request.  For TSO send
	 * requests with one or more cuts in the middle, this is the
1720 	 * number of RDMAs starting after the last cut in the request.
1721 	 * All previous segments before the last cut implicitly have 1
1722 	 * RDMA.
1723 	 *
1724 	 * Since the number of RDMAs is not known beforehand, it must be
1725 	 * filled-in retroactively - after each segmentation cut or at
1726 	 * the end of the entire packet.
1727 	 */
1728 
1729 	while (busdma_seg_cnt) {
1730 		/*
1731 		 * Break the busdma segment up into pieces
1732 		 */
1733 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1734 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1735 		len = seg->ds_len;
1736 
1737 		while (len) {
1738 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1739 			seglen = len;
1740 			cum_len_next = cum_len + seglen;
1741 			(req - rdma_count)->rdma_count = rdma_count + 1;
1742 			if (__predict_true(cum_len >= 0)) {
1743 				/* Payload */
1744 				chop = (cum_len_next > mss);
1745 				cum_len_next = cum_len_next % mss;
1746 				next_is_first = (cum_len_next == 0);
1747 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1748 				flags_next |=
1749 				    next_is_first * MXGEFW_FLAGS_FIRST;
1750 				rdma_count |= -(chop | next_is_first);
1751 				rdma_count += chop & !next_is_first;
1752 			} else if (cum_len_next >= 0) {
1753 				/* Header ends */
1754 				rdma_count = -1;
1755 				cum_len_next = 0;
1756 				seglen = -cum_len;
1757 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1758 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1759 				    MXGEFW_FLAGS_FIRST |
1760 				    (small * MXGEFW_FLAGS_SMALL);
1761 			}
1762 
1763 			req->addr_high = high_swapped;
1764 			req->addr_low = htobe32(low);
1765 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1766 			req->pad = 0;
1767 			req->rdma_count = 1;
1768 			req->length = htobe16(seglen);
1769 			req->cksum_offset = cksum_offset;
1770 			req->flags =
1771 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1772 			low += seglen;
1773 			len -= seglen;
1774 			cum_len = cum_len_next;
1775 			flags = flags_next;
1776 			req++;
1777 			cnt++;
1778 			rdma_count++;
1779 			if (__predict_false(cksum_offset > seglen))
1780 				cksum_offset -= seglen;
1781 			else
1782 				cksum_offset = 0;
1783 			if (__predict_false(cnt > tx->max_desc))
1784 				goto drop;
1785 		}
1786 		busdma_seg_cnt--;
1787 		seg++;
1788 	}
1789 	(req - rdma_count)->rdma_count = rdma_count;
1790 
1791 	do {
1792 		req--;
1793 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1794 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1795 
1796 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1797 
1798 	info_map->map = info_last->map;
1799 	info_last->map = map;
1800 	info_last->m = m;
1801 
1802 	mxge_submit_req(tx, tx->req_list, cnt);
1803 
1804 	if (tx->send_go != NULL && tx->queue_active == 0) {
1805 		/* Tell the NIC to start polling this slice */
1806 		*tx->send_go = 1;
1807 		tx->queue_active = 1;
1808 		tx->activate++;
1809 		wmb();
1810 	}
1811 	return 0;
1812 
1813 drop:
1814 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1815 	m_freem(m);
1816 	return ENOBUFS;
1817 }
1818 
1819 static int
1820 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1821 {
1822 	mcp_kreq_ether_send_t *req;
1823 	bus_dma_segment_t *seg;
1824 	bus_dmamap_t map;
1825 	int cnt, cum_len, err, i, idx, odd_flag;
1826 	uint16_t pseudo_hdr_offset;
1827 	uint8_t flags, cksum_offset;
1828 	struct mxge_buffer_state *info_map, *info_last;
1829 
1830 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1831 		err = mxge_pullup_tso(&m);
1832 		if (__predict_false(err))
1833 			return err;
1834 	}
1835 
1836 	/*
1837 	 * Map the frame for DMA
1838 	 */
1839 	idx = tx->req & tx->mask;
1840 	info_map = &tx->info[idx];
1841 	map = info_map->map;
1842 
1843 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1844 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1845 	if (__predict_false(err != 0))
1846 		goto drop;
1847 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1848 
1849 	/*
1850 	 * TSO is different enough, we handle it in another routine
1851 	 */
1852 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1853 		return mxge_encap_tso(tx, info_map, m, cnt);
1854 
1855 	req = tx->req_list;
1856 	cksum_offset = 0;
1857 	pseudo_hdr_offset = 0;
1858 	flags = MXGEFW_FLAGS_NO_TSO;
1859 
1860 	/*
1861 	 * Checksum offloading
1862 	 */
1863 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1864 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1865 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1866 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1867 		req->cksum_offset = cksum_offset;
1868 		flags |= MXGEFW_FLAGS_CKSUM;
1869 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1870 	} else {
1871 		odd_flag = 0;
1872 	}
1873 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1874 		flags |= MXGEFW_FLAGS_SMALL;
1875 
1876 	/*
1877 	 * Convert segments into a request list
1878 	 */
1879 	cum_len = 0;
1880 	seg = tx->seg_list;
1881 	req->flags = MXGEFW_FLAGS_FIRST;
1882 	for (i = 0; i < cnt; i++) {
1883 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1884 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1885 		req->length = htobe16(seg->ds_len);
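		/*
		 * cksum_offset is relative to the start of each
		 * descriptor; once the start of the L4 header has been
		 * passed, it stays 0 for the remaining segments.
		 */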
1886 		req->cksum_offset = cksum_offset;
1887 		if (cksum_offset > seg->ds_len)
1888 			cksum_offset -= seg->ds_len;
1889 		else
1890 			cksum_offset = 0;
1891 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1892 		req->pad = 0; /* complete solid 16-byte block */
1893 		req->rdma_count = 1;
1894 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1895 		cum_len += seg->ds_len;
1896 		seg++;
1897 		req++;
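		/*
		 * Pre-clear the next slot's flags; req_list has spare
		 * entries, so writing one past the last segment is safe,
		 * and the runt pad below relies on a clean slot.
		 */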
1898 		req->flags = 0;
1899 	}
1900 	req--;
1901 
1902 	/*
1903 	 * Pad runts to 60 bytes (the minimum frame size less 4-byte FCS)
1904 	 */
1905 	if (cum_len < 60) {
1906 		req++;
1907 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1908 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1909 		req->length = htobe16(60 - cum_len);
1910 		req->cksum_offset = 0;
1911 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 		req->pad = 0; /* complete solid 16-byte block */
1913 		req->rdma_count = 1;
1914 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1915 		cnt++;
1916 	}
1917 
1918 	tx->req_list[0].rdma_count = cnt;
1919 #if 0
1920 	/* print what the firmware will see */
1921 	for (i = 0; i < cnt; i++) {
1922 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1923 		    "cso:%d, flags:0x%x, rdma:%d\n",
1924 		    i, (int)ntohl(tx->req_list[i].addr_high),
1925 		    (int)ntohl(tx->req_list[i].addr_low),
1926 		    (int)ntohs(tx->req_list[i].length),
1927 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1928 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1929 		    tx->req_list[i].rdma_count);
1930 	}
1931 	kprintf("--------------\n");
1932 #endif
1933 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1934 
1935 	info_map->map = info_last->map;
1936 	info_last->map = map;
1937 	info_last->m = m;
1938 
1939 	mxge_submit_req(tx, tx->req_list, cnt);
1940 
1941 	if (tx->send_go != NULL && tx->queue_active == 0) {
1942 		/* Tell the NIC to start polling this slice */
1943 		*tx->send_go = 1;
1944 		tx->queue_active = 1;
1945 		tx->activate++;
1946 		wmb();
1947 	}
1948 	return 0;
1949 
1950 drop:
1951 	m_freem(m);
1952 	return err;
1953 }
1954 
1955 static void
1956 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1957 {
1958 	mxge_softc_t *sc = ifp->if_softc;
1959 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1960 	bus_addr_t zeropad;
1961 	int encap = 0;
1962 
1963 	KKASSERT(tx->ifsq == ifsq);
1964 	ASSERT_SERIALIZED(&tx->tx_serialize);
1965 
1966 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1967 		return;
1968 
1969 	zeropad = sc->zeropad_dma.dmem_busaddr;
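	/* Dequeue only while a worst-case (max_desc) encap still fits */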
1970 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1971 		struct mbuf *m;
1972 		int error;
1973 
1974 		m = ifsq_dequeue(ifsq);
1975 		if (m == NULL)
1976 			goto done;
1977 
1978 		BPF_MTAP(ifp, m);
1979 		error = mxge_encap(tx, m, zeropad);
1980 		if (!error)
1981 			encap = 1;
1982 		else
1983 			IFNET_STAT_INC(ifp, oerrors, 1);
1984 	}
1985 
1986 	/* Ran out of transmit slots */
1987 	ifsq_set_oactive(ifsq);
1988 done:
1989 	if (encap)
1990 		ifsq_watchdog_set_count(&tx->watchdog, 5);
1991 }
1992 
1993 static void
1994 mxge_watchdog(struct ifaltq_subque *ifsq)
1995 {
1996 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
1997 	struct mxge_softc *sc = ifp->if_softc;
1998 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1999 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
2000 
2001 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2002 
2003 	/* Check for pause blocking before resetting */
2004 	if (tx->watchdog_rx_pause == rx_pause) {
2005 		mxge_warn_stuck(sc, tx, 0);
2006 		mxge_watchdog_reset(sc);
2007 		return;
2008 	} else {
2009 		if_printf(ifp, "Flow control blocking xmits, "
2010 		    "check link partner\n");
2011 	}
2012 	tx->watchdog_rx_pause = rx_pause;
2013 }
2014 
2015 /*
2016  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2017  * at most 32 bytes at a time, so as to avoid involving the software
2018  * pio handler in the NIC.  We re-write the first segment's low
2019  * DMA address to mark it valid only after we write the entire chunk
2020  * in a burst.
2021  */
2022 static __inline void
2023 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2024     mcp_kreq_ether_recv_t *src)
2025 {
2026 	uint32_t low;
2027 
2028 	low = src->addr_low;
2029 	src->addr_low = 0xffffffff;
2030 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2031 	wmb();
2032 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2033 	wmb();
2034 	src->addr_low = low;
2035 	dst->addr_low = low;
2036 	wmb();
2037 }
2038 
2039 static int
2040 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2041     boolean_t init)
2042 {
2043 	bus_dma_segment_t seg;
2044 	struct mbuf *m;
2045 	int cnt, err, mflag;
2046 
2047 	mflag = M_NOWAIT;
2048 	if (__predict_false(init))
2049 		mflag = M_WAITOK;
2050 
2051 	m = m_gethdr(mflag, MT_DATA);
2052 	if (m == NULL) {
2053 		err = ENOBUFS;
2054 		if (__predict_false(init)) {
2055 			/*
2056 			 * During initialization, there
2057 			 * is nothing to set up; bail out
2058 			 */
2059 			return err;
2060 		}
2061 		goto done;
2062 	}
2063 	m->m_len = m->m_pkthdr.len = MHLEN;
2064 
2065 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2066 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2067 	if (err != 0) {
2068 		m_freem(m);
2069 		if (__predict_false(init)) {
2070 			/*
2071 			 * During initialization, there
2072 			 * is nothing to set up; bail out
2073 			 */
2074 			return err;
2075 		}
2076 		goto done;
2077 	}
2078 
2079 	rx->info[idx].m = m;
2080 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2081 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2082 
2083 done:
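	/* Receive buffers are posted to the NIC in bursts of 8 */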
2084 	if ((idx & 7) == 7)
2085 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2086 	return err;
2087 }
2088 
2089 static int
2090 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2091     boolean_t init)
2092 {
2093 	bus_dma_segment_t seg;
2094 	struct mbuf *m;
2095 	int cnt, err, mflag;
2096 
2097 	mflag = M_NOWAIT;
2098 	if (__predict_false(init))
2099 		mflag = M_WAITOK;
2100 
2101 	if (rx->cl_size == MCLBYTES)
2102 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2103 	else
2104 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2105 	if (m == NULL) {
2106 		err = ENOBUFS;
2107 		if (__predict_false(init)) {
2108 			/*
2109 			 * During initialization, there
2110 			 * is nothing to set up; bail out
2111 			 */
2112 			return err;
2113 		}
2114 		goto done;
2115 	}
2116 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2117 
2118 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2119 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2120 	if (err != 0) {
2121 		m_freem(m);
2122 		if (__predict_false(init)) {
2123 			/*
2124 			 * During initialization, there
2125 			 * is nothing to set up; bail out
2126 			 */
2127 			return err;
2128 		}
2129 		goto done;
2130 	}
2131 
2132 	rx->info[idx].m = m;
2133 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2134 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2135 
2136 done:
2137 	if ((idx & 7) == 7)
2138 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2139 	return err;
2140 }
2141 
2142 /*
2143  * Myri10GE hardware checksums are not valid if the sender
2144  * padded the frame with non-zero padding.  This is because
2145  * the firmware just does a simple 16-bit 1s complement
2146  * checksum across the entire frame, excluding the first 14
2147 	 * bytes.  It is best to simply check the checksum and
2148 	 * tell the stack about it only if the checksum is good.
2149  */
2150 static __inline uint16_t
2151 mxge_rx_csum(struct mbuf *m, int csum)
2152 {
2153 	const struct ether_header *eh;
2154 	const struct ip *ip;
2155 	uint16_t c;
2156 
2157 	eh = mtod(m, const struct ether_header *);
2158 
2159 	/* Only deal with IPv4 TCP & UDP for now */
2160 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2161 		return 1;
2162 
2163 	ip = (const struct ip *)(eh + 1);
2164 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2165 		return 1;
2166 
2167 #ifdef INET
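	/*
	 * Fold the NIC's raw frame checksum together with the IPv4
	 * pseudo-header (addresses, TCP/UDP length, and protocol);
	 * after the final complement, a result of 0 means the TCP/UDP
	 * checksum verified.
	 */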
2168 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2169 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2170 	          (ip->ip_hl << 2) + ip->ip_p));
2171 #else
2172 	c = 1;
2173 #endif
2174 	c ^= 0xffff;
2175 	return c;
2176 }
2177 
2178 static void
2179 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2180 {
2181 	struct ether_vlan_header *evl;
2182 	uint32_t partial;
2183 
2184 	evl = mtod(m, struct ether_vlan_header *);
2185 
2186 	/*
2187 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2188 	 * what the firmware thought was the end of the ethernet
2189 	 * header.
2190 	 */
2191 
2192 	/* Put checksum into host byte order */
2193 	*csum = ntohs(*csum);
2194 
2195 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
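	/*
	 * Ones-complement subtraction of those 4 bytes: add the
	 * complement, propagate the end-around carry, then fold the
	 * 32-bit accumulator back down to 16 bits.
	 */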
2196 	*csum += ~partial;
2197 	*csum += ((*csum) < ~partial);
2198 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2199 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2200 
2201 	/*
2202 	 * Restore checksum to network byte order;
2203 	 * later consumers expect this
2204 	 */
2205 	*csum = htons(*csum);
2206 
2207 	/* save the tag */
2208 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2209 	m->m_flags |= M_VLANTAG;
2210 
2211 	/*
2212 	 * Remove the 802.1q header by copying the Ethernet
2213 	 * addresses over it and adjusting the beginning of
2214 	 * the data in the mbuf.  The encapsulated Ethernet
2215 	 * type field is already in place.
2216 	 */
2217 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2218 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2219 	m_adj(m, EVL_ENCAPLEN);
2220 }
2221 
2222 
2223 static __inline void
2224 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2225     uint32_t len, uint32_t csum)
2226 {
2227 	struct mbuf *m;
2228 	const struct ether_header *eh;
2229 	bus_dmamap_t old_map;
2230 	int idx;
2231 
2232 	idx = rx->cnt & rx->mask;
2233 	rx->cnt++;
2234 
2235 	/* Save a pointer to the received mbuf */
2236 	m = rx->info[idx].m;
2237 
2238 	/* Try to replace the received mbuf */
2239 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2240 		/* Drop the frame -- the old mbuf is re-cycled */
2241 		IFNET_STAT_INC(ifp, ierrors, 1);
2242 		return;
2243 	}
2244 
2245 	/* Unmap the received buffer */
2246 	old_map = rx->info[idx].map;
2247 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2248 	bus_dmamap_unload(rx->dmat, old_map);
2249 
2250 	/* Swap the bus_dmamap_t's */
2251 	rx->info[idx].map = rx->extra_map;
2252 	rx->extra_map = old_map;
2253 
2254 	/*
2255 	 * The mcp implicitly skips the first 2 bytes so that the
2256 	 * packet is properly aligned.
2257 	 */
2258 	m->m_data += MXGEFW_PAD;
2259 
2260 	m->m_pkthdr.rcvif = ifp;
2261 	m->m_len = m->m_pkthdr.len = len;
2262 
2263 	IFNET_STAT_INC(ifp, ipackets, 1);
2264 
2265 	eh = mtod(m, const struct ether_header *);
2266 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2267 		mxge_vlan_tag_remove(m, &csum);
2268 
2269 	/* If the checksum is valid, mark it in the mbuf header */
2270 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2271 	    mxge_rx_csum(m, csum) == 0) {
2272 		/* Tell the stack that the checksum is good */
2273 		m->m_pkthdr.csum_data = 0xffff;
2274 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2275 		    CSUM_DATA_VALID;
2276 	}
2277 	ifp->if_input(ifp, m, NULL, -1);
2278 }
2279 
2280 static __inline void
2281 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2282     uint32_t len, uint32_t csum)
2283 {
2284 	const struct ether_header *eh;
2285 	struct mbuf *m;
2286 	bus_dmamap_t old_map;
2287 	int idx;
2288 
2289 	idx = rx->cnt & rx->mask;
2290 	rx->cnt++;
2291 
2292 	/* Save a pointer to the received mbuf */
2293 	m = rx->info[idx].m;
2294 
2295 	/* Try to replace the received mbuf */
2296 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2297 		/* Drop the frame -- the old mbuf is re-cycled */
2298 		IFNET_STAT_INC(ifp, ierrors, 1);
2299 		return;
2300 	}
2301 
2302 	/* Unmap the received buffer */
2303 	old_map = rx->info[idx].map;
2304 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2305 	bus_dmamap_unload(rx->dmat, old_map);
2306 
2307 	/* Swap the bus_dmamap_t's */
2308 	rx->info[idx].map = rx->extra_map;
2309 	rx->extra_map = old_map;
2310 
2311 	/*
2312 	 * The mcp implicitly skips the first 2 bytes so that the
2313 	 * packet is properly aligned.
2314 	 */
2315 	m->m_data += MXGEFW_PAD;
2316 
2317 	m->m_pkthdr.rcvif = ifp;
2318 	m->m_len = m->m_pkthdr.len = len;
2319 
2320 	IFNET_STAT_INC(ifp, ipackets, 1);
2321 
2322 	eh = mtod(m, const struct ether_header *);
2323 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2324 		mxge_vlan_tag_remove(m, &csum);
2325 
2326 	/* If the checksum is valid, mark it in the mbuf header */
2327 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2328 	    mxge_rx_csum(m, csum) == 0) {
2329 		/* Tell the stack that the checksum is good */
2330 		m->m_pkthdr.csum_data = 0xffff;
2331 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2332 		    CSUM_DATA_VALID;
2333 	}
2334 	ifp->if_input(ifp, m, NULL, -1);
2335 }
2336 
2337 static __inline void
2338 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2339 {
2340 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2341 
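	/* A negative cycle (the interrupt paths pass -1) drains the ring */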
2342 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2343 		uint16_t length, checksum;
2344 
2345 		length = ntohs(rx_done->entry[rx_done->idx].length);
2346 		rx_done->entry[rx_done->idx].length = 0;
2347 
2348 		checksum = rx_done->entry[rx_done->idx].checksum;
2349 
2350 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2351 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2352 			    length, checksum);
2353 		} else {
2354 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2355 			    length, checksum);
2356 		}
2357 
2358 		rx_done->idx++;
2359 		rx_done->idx &= rx_done->mask;
2360 		--cycle;
2361 	}
2362 }
2363 
2364 static __inline void
2365 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2366 {
2367 	ASSERT_SERIALIZED(&tx->tx_serialize);
2368 
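	/*
	 * mcp_idx is the firmware's packet send-done count; advance our
	 * per-descriptor done index until the packets freed here
	 * (tx->pkt_done) catch up with it.
	 */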
2369 	while (tx->pkt_done != mcp_idx) {
2370 		struct mbuf *m;
2371 		int idx;
2372 
2373 		idx = tx->done & tx->mask;
2374 		tx->done++;
2375 
2376 		m = tx->info[idx].m;
2377 		/*
2378 		 * mbuf and DMA map only attached to the first
2379 		 * segment per-mbuf.
2380 		 */
2381 		if (m != NULL) {
2382 			tx->pkt_done++;
2383 			IFNET_STAT_INC(ifp, opackets, 1);
2384 			tx->info[idx].m = NULL;
2385 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2386 			m_freem(m);
2387 		}
2388 	}
2389 
2390 	/*
2391 	 * If we have space, clear OACTIVE to tell the stack that
2392 	 * it's OK to send packets
2393 	 */
2394 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2395 		ifsq_clr_oactive(tx->ifsq);
2396 		if (tx->req == tx->done) {
2397 			/* Reset watchdog */
2398 			ifsq_watchdog_set_count(&tx->watchdog, 0);
2399 		}
2400 	}
2401 
2402 	if (!ifsq_is_empty(tx->ifsq))
2403 		ifsq_devstart(tx->ifsq);
2404 
2405 	if (tx->send_stop != NULL && tx->req == tx->done) {
2406 		/*
2407 		 * Let the NIC stop polling this queue, since there
2408 		 * are no more transmits pending
2409 		 */
2410 		*tx->send_stop = 1;
2411 		tx->queue_active = 0;
2412 		tx->deactivate++;
2413 		wmb();
2414 	}
2415 }
2416 
2417 static struct mxge_media_type mxge_xfp_media_types[] = {
2418 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2419 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2420 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2421 	{IFM_NONE,	(1 << 5),	"10GBASE-ER"},
2422 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2423 	{IFM_NONE,	(1 << 3),	"10GBASE-SW"},
2424 	{IFM_NONE,	(1 << 2),	"10GBASE-LW"},
2425 	{IFM_NONE,	(1 << 1),	"10GBASE-EW"},
2426 	{IFM_NONE,	(1 << 0),	"Reserved"}
2427 };
2428 
2429 static struct mxge_media_type mxge_sfp_media_types[] = {
2430 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2431 	{IFM_NONE,	(1 << 7),	"Reserved"},
2432 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2433 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2434 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2435 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2436 };
2437 
2438 static void
2439 mxge_media_set(mxge_softc_t *sc, int media_type)
2440 {
2441 	int fc_opt = 0;
2442 
2443 	if (media_type == IFM_NONE)
2444 		return;
2445 
2446 	if (sc->pause)
2447 		fc_opt = IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
2448 
2449 	ifmedia_add(&sc->media, MXGE_IFM | media_type, 0, NULL);
2450 	ifmedia_set(&sc->media, MXGE_IFM | media_type | fc_opt);
2451 
2452 	sc->current_media = media_type;
2453 }
2454 
2455 static void
2456 mxge_media_unset(mxge_softc_t *sc)
2457 {
2458 	ifmedia_removeall(&sc->media);
2459 	sc->current_media = IFM_NONE;
2460 }
2461 
2462 static void
2463 mxge_media_init(mxge_softc_t *sc)
2464 {
2465 	const char *ptr;
2466 	int i;
2467 
2468 	mxge_media_unset(sc);
2469 
2470 	/*
2471 	 * Parse the product code to determine the interface type
2472 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2473 	 * after the 3rd dash in the driver's cached copy of the
2474 	 * EEPROM's product code string.
2475 	 */
2476 	ptr = sc->product_code_string;
2477 	if (ptr == NULL) {
2478 		if_printf(sc->ifp, "Missing product code\n");
2479 		return;
2480 	}
2481 
2482 	for (i = 0; i < 3; i++, ptr++) {
2483 		ptr = strchr(ptr, '-');
2484 		if (ptr == NULL) {
2485 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2486 			return;
2487 		}
2488 	}
2489 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2490 		/* -C is CX4 */
2491 		sc->connector = MXGE_CX4;
2492 		mxge_media_set(sc, IFM_10G_CX4);
2493 	} else if (*ptr == 'Q') {
2494 		/* -Q is Quad Ribbon Fiber */
2495 		sc->connector = MXGE_QRF;
2496 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2497 		/* DragonFly has no media type for Quad ribbon fiber */
2498 	} else if (*ptr == 'R') {
2499 		/* -R is XFP */
2500 		sc->connector = MXGE_XFP;
2501 		/* NOTE: ifmedia will be installed later */
2502 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2503 		/* -S or -2S is SFP+ */
2504 		sc->connector = MXGE_SFP;
2505 		/* NOTE: ifmedia will be installed later */
2506 	} else {
2507 		sc->connector = MXGE_UNK;
2508 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2509 	}
2510 }
2511 
2512 /*
2513  * Determine the media type for a NIC.  Some XFPs will identify
2514  * themselves only when their link is up, so this is initiated via a
2515  * link up interrupt.  However, this can potentially take up to
2516  * several milliseconds, so it is run via the watchdog routine, rather
2517  * than in the interrupt handler itself.
2518  */
2519 static void
2520 mxge_media_probe(mxge_softc_t *sc)
2521 {
2522 	mxge_cmd_t cmd;
2523 	const char *cage_type;
2524 	struct mxge_media_type *mxge_media_types = NULL;
2525 	int i, err, ms, mxge_media_type_entries;
2526 	uint32_t byte;
2527 
2528 	sc->need_media_probe = 0;
2529 
2530 	if (sc->connector == MXGE_XFP) {
2531 		/* -R is XFP */
2532 		mxge_media_types = mxge_xfp_media_types;
2533 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2534 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2535 		cage_type = "XFP";
2536 	} else 	if (sc->connector == MXGE_SFP) {
2537 		/* -S or -2S is SFP+ */
2538 		mxge_media_types = mxge_sfp_media_types;
2539 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2540 		cage_type = "SFP+";
2541 		byte = 3;
2542 	} else {
2543 		/* nothing to do; media type cannot change */
2544 		return;
2545 	}
2546 
2547 	/*
2548 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2549 	 * now we try to determine what is in the cage by using the
2550 	 * firmware's I2C commands to read the module's 10GbE compliance
2551 	 * register.  We read just one byte, which may take over
2552 	 * a millisecond.
2553 	 */
2554 
2555 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
2556 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2557 	cmd.data1 = byte;
2558 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2559 	if (err != MXGEFW_CMD_OK) {
2560 		if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2561 			if_printf(sc->ifp, "failed to read XFP\n");
2562 		else if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2563 			if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2564 		else
2565 			if_printf(sc->ifp, "I2C read failed, err: %d\n", err);
2566 		mxge_media_unset(sc);
2567 		return;
2568 	}
2569 
2570 	/* Now we wait for the data to be cached */
2571 	cmd.data0 = byte;
2572 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2573 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2574 		DELAY(1000);
2575 		cmd.data0 = byte;
2576 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2577 	}
2578 	if (err != MXGEFW_CMD_OK) {
2579 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2580 		    cage_type, err, ms);
2581 		mxge_media_unset(sc);
2582 		return;
2583 	}
2584 
2585 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2586 		if (bootverbose) {
2587 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2588 			    mxge_media_types[0].name);
2589 		}
2590 		if (sc->current_media != mxge_media_types[0].flag) {
2591 			mxge_media_unset(sc);
2592 			mxge_media_set(sc, mxge_media_types[0].flag);
2593 		}
2594 		return;
2595 	}
2596 	for (i = 1; i < mxge_media_type_entries; i++) {
2597 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2598 			if (bootverbose) {
2599 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2600 				    mxge_media_types[i].name);
2601 			}
2602 
2603 			if (sc->current_media != mxge_media_types[i].flag) {
2604 				mxge_media_unset(sc);
2605 				mxge_media_set(sc, mxge_media_types[i].flag);
2606 			}
2607 			return;
2608 		}
2609 	}
2610 	mxge_media_unset(sc);
2611 	if (bootverbose) {
2612 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2613 		    cmd.data0);
2614 	}
2615 }
2616 
2617 static void
2618 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2619 {
2620 	if (sc->link_state != stats->link_up) {
2621 		sc->link_state = stats->link_up;
2622 		if (sc->link_state) {
2623 			sc->ifp->if_link_state = LINK_STATE_UP;
2624 			if_link_state_change(sc->ifp);
2625 			if (bootverbose)
2626 				if_printf(sc->ifp, "link up\n");
2627 		} else {
2628 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2629 			if_link_state_change(sc->ifp);
2630 			if (bootverbose)
2631 				if_printf(sc->ifp, "link down\n");
2632 		}
2633 		sc->need_media_probe = 1;
2634 	}
2635 
2636 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2637 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2638 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2639 		    sc->rdma_tags_available);
2640 	}
2641 
2642 	if (stats->link_down) {
2643 		sc->down_cnt += stats->link_down;
2644 		sc->link_state = 0;
2645 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2646 		if_link_state_change(sc->ifp);
2647 	}
2648 }
2649 
2650 static void
2651 mxge_serialize_skipmain(struct mxge_softc *sc)
2652 {
2653 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2654 }
2655 
2656 static void
2657 mxge_deserialize_skipmain(struct mxge_softc *sc)
2658 {
2659 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2660 }
2661 
2662 static void
2663 mxge_legacy(void *arg)
2664 {
2665 	struct mxge_slice_state *ss = arg;
2666 	mxge_softc_t *sc = ss->sc;
2667 	mcp_irq_data_t *stats = ss->fw_stats;
2668 	mxge_tx_ring_t *tx = &ss->tx;
2669 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2670 	uint32_t send_done_count;
2671 	uint8_t valid;
2672 
2673 	ASSERT_SERIALIZED(&sc->main_serialize);
2674 
2675 	/* Make sure the DMA has finished */
2676 	if (!stats->valid)
2677 		return;
2678 	valid = stats->valid;
2679 
2680 	/* Lower legacy IRQ */
2681 	*sc->irq_deassert = 0;
2682 	if (!mxge_deassert_wait) {
2683 		/* Don't wait for conf. that irq is low */
2684 		stats->valid = 0;
2685 	}
2686 
2687 	mxge_serialize_skipmain(sc);
2688 
2689 	/*
2690 	 * Loop while waiting for legacy irq deassertion
2691 	 * XXX do we really want to loop?
2692 	 */
2693 	do {
2694 		/* Check for transmit completes and receives */
2695 		send_done_count = be32toh(stats->send_done_count);
2696 		while ((send_done_count != tx->pkt_done) ||
2697 		       (rx_done->entry[rx_done->idx].length != 0)) {
2698 			if (send_done_count != tx->pkt_done) {
2699 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2700 				    (int)send_done_count);
2701 			}
2702 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2703 			send_done_count = be32toh(stats->send_done_count);
2704 		}
2705 		if (mxge_deassert_wait)
2706 			wmb();
2707 	} while (*((volatile uint8_t *)&stats->valid));
2708 
2709 	mxge_deserialize_skipmain(sc);
2710 
2711 	/* Fw link & error stats meaningful only on the first slice */
2712 	if (__predict_false(stats->stats_updated))
2713 		mxge_intr_status(sc, stats);
2714 
2715 	/* Check to see if we have rx token to pass back */
2716 	if (valid & 0x1)
2717 		*ss->irq_claim = be32toh(3);
2718 	*(ss->irq_claim + 1) = be32toh(3);
2719 }
2720 
2721 static void
2722 mxge_msi(void *arg)
2723 {
2724 	struct mxge_slice_state *ss = arg;
2725 	mxge_softc_t *sc = ss->sc;
2726 	mcp_irq_data_t *stats = ss->fw_stats;
2727 	mxge_tx_ring_t *tx = &ss->tx;
2728 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2729 	uint32_t send_done_count;
2730 	uint8_t valid;
2731 #ifndef IFPOLL_ENABLE
2732 	const boolean_t polling = FALSE;
2733 #else
2734 	boolean_t polling = FALSE;
2735 #endif
2736 
2737 	ASSERT_SERIALIZED(&sc->main_serialize);
2738 
2739 	/* Make sure the DMA has finished */
2740 	if (__predict_false(!stats->valid))
2741 		return;
2742 
2743 	valid = stats->valid;
2744 	stats->valid = 0;
2745 
2746 #ifdef IFPOLL_ENABLE
2747 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2748 		polling = TRUE;
2749 #endif
2750 
2751 	if (!polling) {
2752 		/* Check for receives */
2753 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2754 		if (rx_done->entry[rx_done->idx].length != 0)
2755 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2756 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2757 	}
2758 
2759 	/*
2760 	 * Check for transmit completes
2761 	 *
2762 	 * NOTE:
2763 	 * Since pkt_done is only changed by mxge_tx_done(),
2764 	 * which is called only in interrupt handler, the
2765 	 * check w/o holding tx serializer is MPSAFE.
2766 	 */
2767 	send_done_count = be32toh(stats->send_done_count);
2768 	if (send_done_count != tx->pkt_done) {
2769 		lwkt_serialize_enter(&tx->tx_serialize);
2770 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2771 		lwkt_serialize_exit(&tx->tx_serialize);
2772 	}
2773 
2774 	if (__predict_false(stats->stats_updated))
2775 		mxge_intr_status(sc, stats);
2776 
2777 	/* Check to see if we have rx token to pass back */
2778 	if (!polling && (valid & 0x1))
2779 		*ss->irq_claim = be32toh(3);
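	/*
	 * The second claim register of the pair is always written;
	 * this acks the interrupt for this slice.
	 */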
2780 	*(ss->irq_claim + 1) = be32toh(3);
2781 }
2782 
2783 static void
2784 mxge_msix_rx(void *arg)
2785 {
2786 	struct mxge_slice_state *ss = arg;
2787 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2788 
2789 #ifdef IFPOLL_ENABLE
2790 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2791 		return;
2792 #endif
2793 
2794 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2795 
2796 	if (rx_done->entry[rx_done->idx].length != 0)
2797 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2798 
2799 	*ss->irq_claim = be32toh(3);
2800 }
2801 
2802 static void
2803 mxge_msix_rxtx(void *arg)
2804 {
2805 	struct mxge_slice_state *ss = arg;
2806 	mxge_softc_t *sc = ss->sc;
2807 	mcp_irq_data_t *stats = ss->fw_stats;
2808 	mxge_tx_ring_t *tx = &ss->tx;
2809 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2810 	uint32_t send_done_count;
2811 	uint8_t valid;
2812 #ifndef IFPOLL_ENABLE
2813 	const boolean_t polling = FALSE;
2814 #else
2815 	boolean_t polling = FALSE;
2816 #endif
2817 
2818 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2819 
2820 	/* Make sure the DMA has finished */
2821 	if (__predict_false(!stats->valid))
2822 		return;
2823 
2824 	valid = stats->valid;
2825 	stats->valid = 0;
2826 
2827 #ifdef IFPOLL_ENABLE
2828 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2829 		polling = TRUE;
2830 #endif
2831 
2832 	/* Check for receives */
2833 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2834 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2835 
2836 	/*
2837 	 * Check for transmit completes
2838 	 *
2839 	 * NOTE:
2840 	 * Since pkt_done is only changed by mxge_tx_done(),
2841 	 * which is called only in interrupt handler, the
2842 	 * check w/o holding tx serializer is MPSAFE.
2843 	 */
2844 	send_done_count = be32toh(stats->send_done_count);
2845 	if (send_done_count != tx->pkt_done) {
2846 		lwkt_serialize_enter(&tx->tx_serialize);
2847 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2848 		lwkt_serialize_exit(&tx->tx_serialize);
2849 	}
2850 
2851 	/* Check to see if we have rx token to pass back */
2852 	if (!polling && (valid & 0x1))
2853 		*ss->irq_claim = be32toh(3);
2854 	*(ss->irq_claim + 1) = be32toh(3);
2855 }
2856 
2857 static void
2858 mxge_init(void *arg)
2859 {
2860 	struct mxge_softc *sc = arg;
2861 
2862 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2863 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2864 		mxge_open(sc);
2865 }
2866 
2867 static void
2868 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2869 {
2870 	int i;
2871 
2872 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2873 		if (ss->rx_data.rx_big.info[i].m == NULL)
2874 			continue;
2875 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2876 		    ss->rx_data.rx_big.info[i].map);
2877 		m_freem(ss->rx_data.rx_big.info[i].m);
2878 		ss->rx_data.rx_big.info[i].m = NULL;
2879 	}
2880 
2881 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2882 		if (ss->rx_data.rx_small.info[i].m == NULL)
2883 			continue;
2884 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2885 		    ss->rx_data.rx_small.info[i].map);
2886 		m_freem(ss->rx_data.rx_small.info[i].m);
2887 		ss->rx_data.rx_small.info[i].m = NULL;
2888 	}
2889 
2890 	/* Transmit ring used only on the first slice */
2891 	if (ss->tx.info == NULL)
2892 		return;
2893 
2894 	for (i = 0; i <= ss->tx.mask; i++) {
2895 		if (ss->tx.info[i].m == NULL)
2896 			continue;
2897 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2898 		m_freem(ss->tx.info[i].m);
2899 		ss->tx.info[i].m = NULL;
2900 	}
2901 }
2902 
2903 static void
2904 mxge_free_mbufs(mxge_softc_t *sc)
2905 {
2906 	int slice;
2907 
2908 	for (slice = 0; slice < sc->num_slices; slice++)
2909 		mxge_free_slice_mbufs(&sc->ss[slice]);
2910 }
2911 
2912 static void
2913 mxge_free_slice_rings(struct mxge_slice_state *ss)
2914 {
2915 	int i;
2916 
2917 	if (ss->rx_data.rx_done.entry != NULL) {
2918 		mxge_dma_free(&ss->rx_done_dma);
2919 		ss->rx_data.rx_done.entry = NULL;
2920 	}
2921 
2922 	if (ss->tx.req_list != NULL) {
2923 		kfree(ss->tx.req_list, M_DEVBUF);
2924 		ss->tx.req_list = NULL;
2925 	}
2926 
2927 	if (ss->tx.seg_list != NULL) {
2928 		kfree(ss->tx.seg_list, M_DEVBUF);
2929 		ss->tx.seg_list = NULL;
2930 	}
2931 
2932 	if (ss->rx_data.rx_small.shadow != NULL) {
2933 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2934 		ss->rx_data.rx_small.shadow = NULL;
2935 	}
2936 
2937 	if (ss->rx_data.rx_big.shadow != NULL) {
2938 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2939 		ss->rx_data.rx_big.shadow = NULL;
2940 	}
2941 
2942 	if (ss->tx.info != NULL) {
2943 		if (ss->tx.dmat != NULL) {
2944 			for (i = 0; i <= ss->tx.mask; i++) {
2945 				bus_dmamap_destroy(ss->tx.dmat,
2946 				    ss->tx.info[i].map);
2947 			}
2948 			bus_dma_tag_destroy(ss->tx.dmat);
2949 		}
2950 		kfree(ss->tx.info, M_DEVBUF);
2951 		ss->tx.info = NULL;
2952 	}
2953 
2954 	if (ss->rx_data.rx_small.info != NULL) {
2955 		if (ss->rx_data.rx_small.dmat != NULL) {
2956 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2957 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2958 				    ss->rx_data.rx_small.info[i].map);
2959 			}
2960 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2961 			    ss->rx_data.rx_small.extra_map);
2962 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2963 		}
2964 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2965 		ss->rx_data.rx_small.info = NULL;
2966 	}
2967 
2968 	if (ss->rx_data.rx_big.info != NULL) {
2969 		if (ss->rx_data.rx_big.dmat != NULL) {
2970 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2971 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2972 				    ss->rx_data.rx_big.info[i].map);
2973 			}
2974 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2975 			    ss->rx_data.rx_big.extra_map);
2976 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2977 		}
2978 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2979 		ss->rx_data.rx_big.info = NULL;
2980 	}
2981 }
2982 
2983 static void
2984 mxge_free_rings(mxge_softc_t *sc)
2985 {
2986 	int slice;
2987 
2988 	if (sc->ss == NULL)
2989 		return;
2990 
2991 	for (slice = 0; slice < sc->num_slices; slice++)
2992 		mxge_free_slice_rings(&sc->ss[slice]);
2993 }
2994 
2995 static int
2996 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2997     int tx_ring_entries)
2998 {
2999 	mxge_softc_t *sc = ss->sc;
3000 	size_t bytes;
3001 	int err, i;
3002 
3003 	/*
3004 	 * Allocate per-slice receive resources
3005 	 */
3006 
3007 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
3008 	    rx_ring_entries - 1;
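	/*
	 * The completion (rx_done) ring is shared by the small and big
	 * receive rings, hence twice the entries of either one.
	 */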
3009 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
3010 
3011 	/* Allocate the rx shadow rings */
3012 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3013 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 
3015 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3016 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3017 
3018 	/* Allocate the rx host info rings */
3019 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3020 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3021 
3022 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3023 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3024 
3025 	/* Allocate the rx busdma resources */
3026 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3027 				 1,			/* alignment */
3028 				 4096,			/* boundary */
3029 				 BUS_SPACE_MAXADDR,	/* low */
3030 				 BUS_SPACE_MAXADDR,	/* high */
3031 				 MHLEN,			/* maxsize */
3032 				 1,			/* num segs */
3033 				 MHLEN,			/* maxsegsize */
3034 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3035 				 			/* flags */
3036 				 &ss->rx_data.rx_small.dmat); /* tag */
3037 	if (err != 0) {
3038 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3039 		    err);
3040 		return err;
3041 	}
3042 
3043 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3044 	    &ss->rx_data.rx_small.extra_map);
3045 	if (err != 0) {
3046 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3047 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3048 		ss->rx_data.rx_small.dmat = NULL;
3049 		return err;
3050 	}
3051 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3052 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3053 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3054 		if (err != 0) {
3055 			int j;
3056 
3057 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3058 
3059 			for (j = 0; j < i; ++j) {
3060 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3061 				    ss->rx_data.rx_small.info[j].map);
3062 			}
3063 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3064 			    ss->rx_data.rx_small.extra_map);
3065 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3066 			ss->rx_data.rx_small.dmat = NULL;
3067 			return err;
3068 		}
3069 	}
3070 
3071 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3072 				 1,			/* alignment */
3073 				 4096,			/* boundary */
3074 				 BUS_SPACE_MAXADDR,	/* low */
3075 				 BUS_SPACE_MAXADDR,	/* high */
3076 				 4096,			/* maxsize */
3077 				 1,			/* num segs */
3078 				 4096,			/* maxsegsize*/
3079 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3080 				 			/* flags */
3081 				 &ss->rx_data.rx_big.dmat); /* tag */
3082 	if (err != 0) {
3083 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3084 		    err);
3085 		return err;
3086 	}
3087 
3088 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3089 	    &ss->rx_data.rx_big.extra_map);
3090 	if (err != 0) {
3091 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3092 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3093 		ss->rx_data.rx_big.dmat = NULL;
3094 		return err;
3095 	}
3096 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3097 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3098 		    &ss->rx_data.rx_big.info[i].map);
3099 		if (err != 0) {
3100 			int j;
3101 
3102 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3103 			for (j = 0; j < i; ++j) {
3104 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3105 				    ss->rx_data.rx_big.info[j].map);
3106 			}
3107 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3108 			    ss->rx_data.rx_big.extra_map);
3109 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3110 			ss->rx_data.rx_big.dmat = NULL;
3111 			return err;
3112 		}
3113 	}
3114 
3115 	/*
3116 	 * Now allocate TX resources
3117 	 */
3118 
3119 	ss->tx.mask = tx_ring_entries - 1;
3120 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3121 
3122 	/*
3123 	 * Allocate the tx request copy block; MUST be at least 8-byte
3124 	 * aligned.
3125 	 */
3126 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3127 	ss->tx.req_list = kmalloc(__VM_CACHELINE_ALIGN(bytes),
3128 				  M_DEVBUF,
3129 				  M_WAITOK | M_CACHEALIGN);
3130 
3131 	/* Allocate the tx busdma segment list */
3132 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3133 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3134 
3135 	/* Allocate the tx host info ring */
3136 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3137 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3138 
3139 	/* Allocate the tx busdma resources */
3140 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3141 				 1,			/* alignment */
3142 				 sc->tx_boundary,	/* boundary */
3143 				 BUS_SPACE_MAXADDR,	/* low */
3144 				 BUS_SPACE_MAXADDR,	/* high */
3145 				 IP_MAXPACKET +
3146 				 sizeof(struct ether_vlan_header),
3147 				 			/* maxsize */
3148 				 ss->tx.max_desc - 2,	/* num segs */
3149 				 sc->tx_boundary,	/* maxsegsz */
3150 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3151 				 BUS_DMA_ONEBPAGE,	/* flags */
3152 				 &ss->tx.dmat);		/* tag */
3153 	if (err != 0) {
3154 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3155 		return err;
3156 	}
3157 
3158 	/*
3159 	 * Now use these tags to setup DMA maps for each slot in the ring
3160 	 */
3161 	for (i = 0; i <= ss->tx.mask; i++) {
3162 		err = bus_dmamap_create(ss->tx.dmat,
3163 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3164 		if (err != 0) {
3165 			int j;
3166 
3167 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3168 			for (j = 0; j < i; ++j) {
3169 				bus_dmamap_destroy(ss->tx.dmat,
3170 				    ss->tx.info[j].map);
3171 			}
3172 			bus_dma_tag_destroy(ss->tx.dmat);
3173 			ss->tx.dmat = NULL;
3174 			return err;
3175 		}
3176 	}
3177 	return 0;
3178 }
3179 
3180 static int
3181 mxge_alloc_rings(mxge_softc_t *sc)
3182 {
3183 	mxge_cmd_t cmd;
3184 	int tx_ring_size;
3185 	int tx_ring_entries, rx_ring_entries;
3186 	int err, slice;
3187 
3188 	/* Get ring sizes */
3189 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3190 	if (err != 0) {
3191 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3192 		return err;
3193 	}
3194 	tx_ring_size = cmd.data0;
3195 
3196 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3197 	rx_ring_entries = sc->rx_intr_slots / 2;
3198 
3199 	if (bootverbose) {
3200 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3201 		    tx_ring_entries, rx_ring_entries);
3202 	}
3203 
3204 	sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3205 	sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3206 
3207 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3208 	ifq_set_ready(&sc->ifp->if_snd);
3209 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3210 
3211 	if (sc->num_tx_rings > 1) {
3212 		sc->ifp->if_mapsubq = ifq_mapsubq_modulo;
3213 		ifq_set_subq_divisor(&sc->ifp->if_snd, sc->num_tx_rings);
3214 	}
3215 
3216 	for (slice = 0; slice < sc->num_slices; slice++) {
3217 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3218 		    rx_ring_entries, tx_ring_entries);
3219 		if (err != 0) {
3220 			device_printf(sc->dev,
3221 			    "alloc %d slice rings failed\n", slice);
3222 			return err;
3223 		}
3224 	}
3225 	return 0;
3226 }
3227 
3228 static void
3229 mxge_choose_params(int mtu, int *cl_size)
3230 {
3231 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3232 
3233 	if (bufsize < MCLBYTES) {
3234 		*cl_size = MCLBYTES;
3235 	} else {
3236 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3237 		*cl_size = MJUMPAGESIZE;
3238 	}
3239 }
3240 
3241 static int
3242 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3243 {
3244 	mxge_cmd_t cmd;
3245 	int err, i, slice;
3246 
3247 	slice = ss - ss->sc->ss;
3248 
3249 	/*
3250 	 * Get the lanai pointers to the send and receive rings
3251 	 */
3252 	err = 0;
3253 
3254 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
3255 	if (ss->sc->num_tx_rings == 1) {
3256 		if (slice == 0) {
3257 			cmd.data0 = slice;
3258 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3259 			    &cmd);
3260 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3261 			    (ss->sc->sram + cmd.data0);
3262 			/* Leave send_go and send_stop as NULL */
3263 		}
3264 	} else {
3265 		cmd.data0 = slice;
3266 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3267 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3268 		    (ss->sc->sram + cmd.data0);
3269 		ss->tx.send_go = (volatile uint32_t *)
3270 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3271 		ss->tx.send_stop = (volatile uint32_t *)
3272 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3273 	}
3274 
3275 	cmd.data0 = slice;
3276 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3277 	ss->rx_data.rx_small.lanai =
3278 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3279 
3280 	cmd.data0 = slice;
3281 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3282 	ss->rx_data.rx_big.lanai =
3283 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3284 
3285 	if (err != 0) {
3286 		if_printf(ss->sc->ifp,
3287 		    "failed to get ring sizes or locations\n");
3288 		return EIO;
3289 	}
3290 
3291 	/*
3292 	 * Stock small receive ring
3293 	 */
3294 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3295 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3296 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3297 		if (err) {
3298 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3299 			    ss->rx_data.rx_small.mask + 1);
3300 			return ENOMEM;
3301 		}
3302 	}
3303 
3304 	/*
3305 	 * Stock big receive ring
3306 	 */
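	/*
	 * Pre-mark every shadow slot invalid: mxge_submit_8rx() treats
	 * an all-ones addr_low as the not-yet-valid marker.
	 */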
3307 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3308 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3309 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3310 	}
3311 
3312 	ss->rx_data.rx_big.cl_size = cl_size;
3313 
3314 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3315 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3316 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3317 		if (err) {
3318 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3319 			    ss->rx_data.rx_big.mask + 1);
3320 			return ENOMEM;
3321 		}
3322 	}
3323 	return 0;
3324 }
3325 
3326 static int
3327 mxge_open(mxge_softc_t *sc)
3328 {
3329 	struct ifnet *ifp = sc->ifp;
3330 	mxge_cmd_t cmd;
3331 	int err, slice, cl_size, i;
3332 	bus_addr_t bus;
3333 	volatile uint8_t *itable;
3334 	struct mxge_slice_state *ss;
3335 
3336 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3337 
3338 	/* Copy the MAC address in case it was overridden */
3339 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3340 
3341 	err = mxge_reset(sc, 1);
3342 	if (err != 0) {
3343 		if_printf(ifp, "failed to reset\n");
3344 		return EIO;
3345 	}
3346 
3347 	if (sc->num_slices > 1) {
3348 		/*
3349 		 * Set up the indirect table.
3350 		 */
3351 		if_ringmap_rdrtable(sc->ring_map, sc->rdr_table, NETISR_CPUMAX);
3352 
3353 		cmd.data0 = NETISR_CPUMAX;
3354 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3355 
3356 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3357 		if (err != 0) {
3358 			if_printf(ifp, "failed to setup rss tables\n");
3359 			return err;
3360 		}
3361 
3362 		itable = sc->sram + cmd.data0;
3363 		for (i = 0; i < NETISR_CPUMAX; i++)
3364 			itable[i] = sc->rdr_table[i];
3365 
3366 		if (sc->use_rss) {
3367 			volatile uint8_t *hwkey;
3368 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3369 
3370 			/*
3371 			 * Set up the Toeplitz key.
3372 			 */
3373 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3374 			    &cmd);
3375 			if (err != 0) {
3376 				if_printf(ifp, "failed to get rsskey\n");
3377 				return err;
3378 			}
3379 			hwkey = sc->sram + cmd.data0;
3380 
3381 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3382 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3383 				hwkey[i] = swkey[i];
3384 			wmb();
3385 
3386 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3387 			    &cmd);
3388 			if (err != 0) {
3389 				if_printf(ifp, "failed to update rsskey\n");
3390 				return err;
3391 			}
3392 			if (bootverbose)
3393 				if_printf(ifp, "RSS key updated\n");
3394 		}
3395 
3396 		cmd.data0 = 1;
3397 		if (sc->use_rss) {
3398 			if (bootverbose)
3399 				if_printf(ifp, "input hash: RSS\n");
3400 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3401 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3402 		} else {
3403 			if (bootverbose)
3404 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3405 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3406 		}
3407 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3408 		if (err != 0) {
3409 			if_printf(ifp, "failed to enable slices\n");
3410 			return err;
3411 		}
3412 	}
3413 
3414 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3415 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3416 	if (err) {
3417 		/*
3418 		 * If we can't change the TSO mode to NDIS, never allow TSO
3419 		 */
3420 		if_printf(ifp, "failed to set TSO mode\n");
3421 		ifp->if_capenable &= ~IFCAP_TSO;
3422 		ifp->if_capabilities &= ~IFCAP_TSO;
3423 		ifp->if_hwassist &= ~CSUM_TSO;
3424 	}
3425 
3426 	mxge_choose_params(ifp->if_mtu, &cl_size);
3427 
3428 	cmd.data0 = 1;
3429 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3430 	/*
3431 	 * Error is only meaningful if we're trying to set
3432 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3433 	 */
3434 
3435 	/*
3436 	 * Give the firmware the mtu and the big and small buffer
3437 	 * sizes.  The firmware wants the big buf size to be a power
3438 	 * of two. Luckily, DragonFly's clusters are powers of two.
3439 	 */
3440 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3441 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3442 
3443 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3444 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3445 
3446 	cmd.data0 = cl_size;
3447 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3448 
3449 	if (err != 0) {
3450 		if_printf(ifp, "failed to setup params\n");
3451 		goto abort;
3452 	}
3453 
3454 	/* Now give the firmware the pointer to the stats block */
3455 	for (slice = 0; slice < sc->num_slices; slice++) {
3456 		ss = &sc->ss[slice];
3457 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3458 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3459 		cmd.data2 = sizeof(struct mcp_irq_data);
3460 		cmd.data2 |= (slice << 16);
3461 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3462 	}
3463 
3464 	if (err != 0) {
3465 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3466 		bus += offsetof(struct mcp_irq_data, send_done_count);
3467 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3468 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3469 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3470 		    &cmd);
3471 
3472 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3473 		sc->fw_multicast_support = 0;
3474 	} else {
3475 		sc->fw_multicast_support = 1;
3476 	}
3477 
3478 	if (err != 0) {
3479 		if_printf(ifp, "failed to setup params\n");
3480 		goto abort;
3481 	}
3482 
3483 	for (slice = 0; slice < sc->num_slices; slice++) {
3484 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3485 		if (err != 0) {
3486 			if_printf(ifp, "couldn't open slice %d\n", slice);
3487 			goto abort;
3488 		}
3489 	}
3490 
3491 	/* Finally, start the firmware running */
3492 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3493 	if (err) {
3494 		if_printf(ifp, "Couldn't bring up link\n");
3495 		goto abort;
3496 	}
3497 
3498 	ifp->if_flags |= IFF_RUNNING;
3499 	for (i = 0; i < sc->num_tx_rings; ++i) {
3500 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3501 
3502 		ifsq_clr_oactive(tx->ifsq);
3503 		ifsq_watchdog_start(&tx->watchdog);
3504 	}
3505 
3506 	return 0;
3507 
3508 abort:
3509 	mxge_free_mbufs(sc);
3510 	return err;
3511 }
3512 
3513 static void
3514 mxge_close(mxge_softc_t *sc, int down)
3515 {
3516 	struct ifnet *ifp = sc->ifp;
3517 	mxge_cmd_t cmd;
3518 	int err, old_down_cnt, i;
3519 
3520 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3521 
3522 	if (!down) {
3523 		old_down_cnt = sc->down_cnt;
3524 		wmb();
3525 
3526 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3527 		if (err)
3528 			if_printf(ifp, "Couldn't bring down link\n");
3529 
3530 		if (old_down_cnt == sc->down_cnt) {
3531 			/*
3532 			 * Wait for down irq
3533 			 * XXX racy
3534 			 */
3535 			ifnet_deserialize_all(ifp);
3536 			DELAY(10 * sc->intr_coal_delay);
3537 			ifnet_serialize_all(ifp);
3538 		}
3539 
3540 		wmb();
3541 		if (old_down_cnt == sc->down_cnt)
3542 			if_printf(ifp, "never got down irq\n");
3543 	}
3544 	mxge_free_mbufs(sc);
3545 
3546 	ifp->if_flags &= ~IFF_RUNNING;
3547 	for (i = 0; i < sc->num_tx_rings; ++i) {
3548 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3549 
3550 		ifsq_clr_oactive(tx->ifsq);
3551 		ifsq_watchdog_stop(&tx->watchdog);
3552 	}
3553 }
3554 
3555 static void
3556 mxge_setup_cfg_space(mxge_softc_t *sc)
3557 {
3558 	device_t dev = sc->dev;
3559 	int reg;
3560 	uint16_t lnk, pectl;
3561 
3562 	/* Find the PCIe link width and set max read request to 4KB */
3563 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3564 		lnk = pci_read_config(dev, reg + 0x12, 2);
3565 		sc->link_width = (lnk >> 4) & 0x3f;
3566 
3567 		if (sc->pectl == 0) {
3568 			pectl = pci_read_config(dev, reg + 0x8, 2);
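			/* Max_Read_Request_Size (bits 14:12) = 5 -> 4KB */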
3569 			pectl = (pectl & ~0x7000) | (5 << 12);
3570 			pci_write_config(dev, reg + 0x8, pectl, 2);
3571 			sc->pectl = pectl;
3572 		} else {
3573 			/* Restore saved pectl after watchdog reset */
3574 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3575 		}
3576 	}
3577 
3578 	/* Enable DMA and memory space access */
3579 	pci_enable_busmaster(dev);
3580 }
3581 
3582 static uint32_t
3583 mxge_read_reboot(mxge_softc_t *sc)
3584 {
3585 	device_t dev = sc->dev;
3586 	uint32_t vs;
3587 
3588 	/* Find the vendor specific offset */
3589 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3590 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3591 		return (uint32_t)-1;
3592 	}
3593 	/* Enable read32 mode */
3594 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3595 	/* Tell NIC which register to read */
3596 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3597 	return pci_read_config(dev, vs + 0x14, 4);
3598 }
3599 
3600 static void
3601 mxge_watchdog_reset(mxge_softc_t *sc)
3602 {
3603 	struct pci_devinfo *dinfo;
3604 	int err, running;
3605 	uint32_t reboot;
3606 	uint16_t cmd;
3607 
3608 	err = ENXIO;
3609 
3610 	if_printf(sc->ifp, "Watchdog reset!\n");
3611 
3612 	/*
3613 	 * Check to see if the NIC rebooted.  If it did, then all of
3614 	 * PCI config space has been reset, and things like the
3615 	 * busmaster bit will be zero.  If this is the case, then we
3616 	 * must restore PCI config space before the NIC can be used
3617 	 * again
3618 	 */
3619 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3620 	if (cmd == 0xffff) {
3621 		/*
3622 		 * Maybe the watchdog caught the NIC rebooting; wait
3623 		 * up to 100ms for it to finish.  If it does not come
3624 		 * back, then give up
3625 		 */
3626 		DELAY(1000*100);
3627 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3628 		if (cmd == 0xffff)
3629 			if_printf(sc->ifp, "NIC disappeared!\n");
3630 	}
3631 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3632 		/* Print the reboot status */
3633 		reboot = mxge_read_reboot(sc);
3634 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3635 
3636 		running = sc->ifp->if_flags & IFF_RUNNING;
3637 		if (running) {
3638 			/*
3639 			 * Quiesce NIC so that TX routines will not try to
3640 			 * xmit after restoration of BAR
3641 			 */
3642 
3643 			/* Mark the link as down */
3644 			if (sc->link_state) {
3645 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3646 				if_link_state_change(sc->ifp);
3647 			}
3648 			mxge_close(sc, 1);
3649 		}
3650 		/* Restore PCI configuration space */
3651 		dinfo = device_get_ivars(sc->dev);
3652 		pci_cfg_restore(sc->dev, dinfo);
3653 
3654 		/* And redo any changes we made to our config space */
3655 		mxge_setup_cfg_space(sc);
3656 
3657 		/* Reload f/w */
3658 		err = mxge_load_firmware(sc, 0);
3659 		if (err)
3660 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3661 		if (running && !err) {
3662 			int i;
3663 
3664 			err = mxge_open(sc);
3665 
3666 			for (i = 0; i < sc->num_tx_rings; ++i)
3667 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3668 		}
3669 		sc->watchdog_resets++;
3670 	} else {
3671 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3672 		err = 0;
3673 	}
3674 	if (err) {
3675 		if_printf(sc->ifp, "watchdog reset failed\n");
3676 	} else {
3677 		if (sc->dying == 2)
3678 			sc->dying = 0;
3679 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3680 	}
3681 }
3682 
3683 static void
3684 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3685 {
3686 	if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3687 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3688 	    tx->req, tx->done, tx->queue_active);
3689 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3690 	    tx->activate, tx->deactivate);
3691 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3692 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3693 }
3694 
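/*
 * Sample the interface packet counters and return the number of
 * packets moved since the last call; mxge_tick() treats a zero
 * delta as a sign that the NIC is idle.
 */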
3695 static u_long
3696 mxge_update_stats(mxge_softc_t *sc)
3697 {
3698 	u_long ipackets, opackets, pkts;
3699 
3700 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3701 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3702 
3703 	pkts = ipackets - sc->ipackets;
3704 	pkts += opackets - sc->opackets;
3705 
3706 	sc->ipackets = ipackets;
3707 	sc->opackets = opackets;
3708 
3709 	return pkts;
3710 }
3711 
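/*
 * Periodic housekeeping callout: aggregate slice statistics,
 * reprobe the media when requested and, when the NIC has been
 * idle, verify that bus mastering is still enabled (losing it
 * indicates a firmware reboot and triggers the watchdog reset).
 * The callout interval is stretched 4x while idle.
 */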
3712 static void
3713 mxge_tick(void *arg)
3714 {
3715 	mxge_softc_t *sc = arg;
3716 	u_long pkts = 0;
3717 	int err = 0;
3718 	int ticks;
3719 
3720 	lwkt_serialize_enter(&sc->main_serialize);
3721 
3722 	ticks = mxge_ticks;
3723 	if (sc->ifp->if_flags & IFF_RUNNING) {
3724 		/* Aggregate stats from different slices */
3725 		pkts = mxge_update_stats(sc);
3726 		if (sc->need_media_probe)
3727 			mxge_media_probe(sc);
3728 	}
3729 	if (pkts == 0) {
3730 		uint16_t cmd;
3731 
3732 		/* Ensure NIC did not suffer h/w fault while idle */
3733 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3734 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3735 			sc->dying = 2;
3736 			mxge_serialize_skipmain(sc);
3737 			mxge_watchdog_reset(sc);
3738 			mxge_deserialize_skipmain(sc);
3739 			err = ENXIO;
3740 		}
3741 
3742 		/* Look less often if NIC is idle */
3743 		ticks *= 4;
3744 	}
3745 
3746 	if (err == 0)
3747 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3748 
3749 	lwkt_serialize_exit(&sc->main_serialize);
3750 }
3751 
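/*
 * The only media change supported here is toggling flow control;
 * all other media parameters are fixed, so translate the requested
 * pause options into a mxge_change_pause() call.
 */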
3752 static int
3753 mxge_media_change(struct ifnet *ifp)
3754 {
3755 	mxge_softc_t *sc = ifp->if_softc;
3756 	const struct ifmedia *ifm = &sc->media;
3757 	int pause;
3758 
3759 	if (IFM_OPTIONS(ifm->ifm_media) & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
3760 		if (sc->pause)
3761 			return 0;
3762 		pause = 1;
3763 	} else {
3764 		if (!sc->pause)
3765 			return 0;
3766 		pause = 0;
3767 	}
3768 	return mxge_change_pause(sc, pause);
3769 }
3770 
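/*
 * Validate and apply a new MTU.  A running interface must be
 * closed and reopened so its buffers are resized; if reopening
 * fails, fall back to the old MTU and reopen with that instead.
 */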
3771 static int
3772 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3773 {
3774 	struct ifnet *ifp = sc->ifp;
3775 	int real_mtu, old_mtu;
3776 	int err = 0;
3777 
3778 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3779 	if (mtu > sc->max_mtu || real_mtu < 60)
3780 		return EINVAL;
3781 
3782 	old_mtu = ifp->if_mtu;
3783 	ifp->if_mtu = mtu;
3784 	if (ifp->if_flags & IFF_RUNNING) {
3785 		mxge_close(sc, 0);
3786 		err = mxge_open(sc);
3787 		if (err != 0) {
3788 			ifp->if_mtu = old_mtu;
3789 			mxge_close(sc, 0);
3790 			mxge_open(sc);
3791 		}
3792 	}
3793 	return err;
3794 }
3795 
3796 static void
3797 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3798 {
3799 	mxge_softc_t *sc = ifp->if_softc;
3800 
3801 	ifmr->ifm_status = IFM_AVALID;
3802 	ifmr->ifm_active = IFM_ETHER;
3803 
3804 	if (sc->link_state)
3805 		ifmr->ifm_status |= IFM_ACTIVE;
3806 
3807 	/*
3808 	 * Autoselect is not supported, so the current media
3809 	 * should be delivered.
3810 	 */
3811 	ifmr->ifm_active |= sc->current_media;
3812 	if (sc->current_media != IFM_NONE) {
3813 		ifmr->ifm_active |= MXGE_IFM;
3814 		if (sc->pause)
3815 			ifmr->ifm_active |= IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
3816 	}
3817 }
3818 
3819 static int
3820 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3821     struct ucred *cr __unused)
3822 {
3823 	mxge_softc_t *sc = ifp->if_softc;
3824 	struct ifreq *ifr = (struct ifreq *)data;
3825 	int err, mask;
3826 
3827 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3828 	err = 0;
3829 
3830 	switch (command) {
3831 	case SIOCSIFMTU:
3832 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3833 		break;
3834 
3835 	case SIOCSIFFLAGS:
3836 		if (sc->dying)
3837 			return EINVAL;
3838 
3839 		if (ifp->if_flags & IFF_UP) {
3840 			if (!(ifp->if_flags & IFF_RUNNING)) {
3841 				err = mxge_open(sc);
3842 			} else {
3843 				/*
3844 				 * Take care of PROMISC and ALLMULTI
3845 				 * flag changes
3846 				 */
3847 				mxge_change_promisc(sc,
3848 				    ifp->if_flags & IFF_PROMISC);
3849 				mxge_set_multicast_list(sc);
3850 			}
3851 		} else {
3852 			if (ifp->if_flags & IFF_RUNNING)
3853 				mxge_close(sc, 0);
3854 		}
3855 		break;
3856 
3857 	case SIOCADDMULTI:
3858 	case SIOCDELMULTI:
3859 		mxge_set_multicast_list(sc);
3860 		break;
3861 
3862 	case SIOCSIFCAP:
3863 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3864 		if (mask & IFCAP_TXCSUM) {
3865 			ifp->if_capenable ^= IFCAP_TXCSUM;
3866 			if (ifp->if_capenable & IFCAP_TXCSUM)
3867 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3868 			else
3869 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3870 		}
3871 		if (mask & IFCAP_TSO) {
3872 			ifp->if_capenable ^= IFCAP_TSO;
3873 			if (ifp->if_capenable & IFCAP_TSO)
3874 				ifp->if_hwassist |= CSUM_TSO;
3875 			else
3876 				ifp->if_hwassist &= ~CSUM_TSO;
3877 		}
3878 		if (mask & IFCAP_RXCSUM)
3879 			ifp->if_capenable ^= IFCAP_RXCSUM;
3880 		if (mask & IFCAP_VLAN_HWTAGGING)
3881 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3882 		break;
3883 
3884 	case SIOCGIFMEDIA:
3885 	case SIOCSIFMEDIA:
3886 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3887 		    &sc->media, command);
3888 		break;
3889 
3890 	default:
3891 		err = ether_ioctl(ifp, command, data);
3892 		break;
3893 	}
3894 	return err;
3895 }
3896 
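/*
 * Copy the driver tunables into the softc, clamping the interrupt
 * coalescing delay and the throttle value to their supported
 * ranges and defaulting the tick interval to hz/2.
 */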
3897 static void
3898 mxge_fetch_tunables(mxge_softc_t *sc)
3899 {
3900 	int ifm;
3901 
3902 	sc->intr_coal_delay = mxge_intr_coal_delay;
3903 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3904 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3905 
3906 	/* XXX */
3907 	if (mxge_ticks == 0)
3908 		mxge_ticks = hz / 2;
3909 
3910 	ifm = ifmedia_str2ethfc(mxge_flowctrl);
3911 	if (ifm & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE))
3912 		sc->pause = 1;
3913 
3914 	sc->use_rss = mxge_use_rss;
3915 
3916 	sc->throttle = mxge_throttle;
3917 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3918 		sc->throttle = MXGE_MAX_THROTTLE;
3919 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3920 		sc->throttle = MXGE_MIN_THROTTLE;
3921 }
3922 
3923 static void
3924 mxge_free_slices(mxge_softc_t *sc)
3925 {
3926 	struct mxge_slice_state *ss;
3927 	int i;
3928 
3929 	if (sc->ss == NULL)
3930 		return;
3931 
3932 	for (i = 0; i < sc->num_slices; i++) {
3933 		ss = &sc->ss[i];
3934 		if (ss->fw_stats != NULL) {
3935 			mxge_dma_free(&ss->fw_stats_dma);
3936 			ss->fw_stats = NULL;
3937 		}
3938 		if (ss->rx_data.rx_done.entry != NULL) {
3939 			mxge_dma_free(&ss->rx_done_dma);
3940 			ss->rx_data.rx_done.entry = NULL;
3941 		}
3942 	}
3943 	kfree(sc->ss, M_DEVBUF);
3944 	sc->ss = NULL;
3945 }
3946 
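/*
 * Allocate the per-slice state: for each slice, a DMA'd rx
 * completion ("interrupt") queue sized after the firmware's rx
 * ring, and a DMA'd firmware statistics block.
 */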
3947 static int
3948 mxge_alloc_slices(mxge_softc_t *sc)
3949 {
3950 	mxge_cmd_t cmd;
3951 	struct mxge_slice_state *ss;
3952 	size_t bytes;
3953 	int err, i, rx_ring_size;
3954 
3955 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3956 	if (err != 0) {
3957 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3958 		return err;
3959 	}
3960 	rx_ring_size = cmd.data0;
3961 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof (mcp_dma_addr_t));
3962 
3963 	bytes = sizeof(*sc->ss) * sc->num_slices;
3964 	sc->ss = kmalloc(bytes, M_DEVBUF,
3965 			 M_WAITOK | M_ZERO | M_CACHEALIGN);
3966 
3967 	for (i = 0; i < sc->num_slices; i++) {
3968 		ss = &sc->ss[i];
3969 
3970 		ss->sc = sc;
3971 
3972 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3973 		lwkt_serialize_init(&ss->tx.tx_serialize);
3974 		ss->intr_rid = -1;
3975 
3976 		/*
3977 		 * Allocate per-slice rx interrupt queue
3978 		 * XXX assume 4bytes mcp_slot
3979 		 * XXX assume 4-byte mcp_slot
3980 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3981 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3982 		if (err != 0) {
3983 			device_printf(sc->dev,
3984 			    "alloc %d slice rx_done failed\n", i);
3985 			return err;
3986 		}
3987 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3988 
3989 		/*
3990 		 * Allocate the per-slice firmware stats
3991 		 */
3992 		bytes = sizeof(*ss->fw_stats);
3993 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3994 		    bytes, 64);
3995 		if (err != 0) {
3996 			device_printf(sc->dev,
3997 			    "alloc %d fw_stats failed\n", i);
3998 			return err;
3999 		}
4000 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
4001 	}
4002 	return 0;
4003 }
4004 
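/*
 * Determine how many slices (rx rings, and optionally tx rings)
 * can be used.  Multiple slices require more than one netisr CPU,
 * enough MSI-X vectors and the RSS firmware; any failure along
 * the way reverts to the plain firmware and a single slice.
 */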
4005 static void
4006 mxge_slice_probe(mxge_softc_t *sc)
4007 {
4008 	int status, max_intr_slots, max_slices, num_slices;
4009 	int msix_cnt, msix_enable, multi_tx;
4010 	mxge_cmd_t cmd;
4011 	const char *old_fw;
4012 
4013 	sc->num_slices = 1;
4014 	sc->num_tx_rings = 1;
4015 
4016 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
4017 	if (num_slices == 1)
4018 		return;
4019 
4020 	if (netisr_ncpus == 1)
4021 		return;
4022 
4023 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
4024 	    mxge_msix_enable);
4025 	if (!msix_enable)
4026 		return;
4027 
4028 	msix_cnt = pci_msix_count(sc->dev);
4029 	if (msix_cnt < 2)
4030 		return;
4031 	if (bootverbose)
4032 		device_printf(sc->dev, "MSI-X count %d\n", msix_cnt);
4033 
4034 	/*
4035 	 * Now load the slice-aware firmware and see what it supports
4036 	 */
4037 	old_fw = sc->fw_name;
4038 	if (old_fw == mxge_fw_aligned)
4039 		sc->fw_name = mxge_fw_rss_aligned;
4040 	else
4041 		sc->fw_name = mxge_fw_rss_unaligned;
4042 	status = mxge_load_firmware(sc, 0);
4043 	if (status != 0) {
4044 		device_printf(sc->dev, "Falling back to a single slice\n");
4045 		return;
4046 	}
4047 
4048 	/*
4049 	 * Try to send a reset command to the card to see if it is alive
4050 	 */
4051 	memset(&cmd, 0, sizeof(cmd));
4052 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4053 	if (status != 0) {
4054 		device_printf(sc->dev, "failed reset\n");
4055 		goto abort_with_fw;
4056 	}
4057 
4058 	/*
4059 	 * Get rx ring size to calculate rx interrupt queue size
4060 	 */
4061 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4062 	if (status != 0) {
4063 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4064 		goto abort_with_fw;
4065 	}
4066 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4067 
4068 	/*
4069 	 * Tell it the size of the rx interrupt queue
4070 	 */
4071 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4072 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4073 	if (status != 0) {
4074 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4075 		goto abort_with_fw;
4076 	}
4077 
4078 	/*
4079 	 * Ask the firmware for the maximum number of slices it supports
4080 	 */
4081 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4082 	if (status != 0) {
4083 		device_printf(sc->dev,
4084 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4085 		goto abort_with_fw;
4086 	}
4087 	max_slices = cmd.data0;
4088 	if (bootverbose)
4089 		device_printf(sc->dev, "max slices %d\n", max_slices);
4090 
4091 	if (max_slices > msix_cnt)
4092 		max_slices = msix_cnt;
4093 
4094 	sc->ring_map = if_ringmap_alloc(sc->dev, num_slices, max_slices);
4095 	sc->num_slices = if_ringmap_count(sc->ring_map);
4096 
4097 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4098 	if (multi_tx)
4099 		sc->num_tx_rings = sc->num_slices;
4100 
4101 	if (bootverbose) {
4102 		device_printf(sc->dev, "using %d slices, max %d\n",
4103 		    sc->num_slices, max_slices);
4104 	}
4105 
4106 	if (sc->num_slices == 1)
4107 		goto abort_with_fw;
4108 	return;
4109 
4110 abort_with_fw:
4111 	sc->fw_name = old_fw;
4112 	mxge_load_firmware(sc, 0);
4113 }
4114 
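/*
 * Build the serializer array consumed by the ifnet serialize
 * methods below; the main serializer must come first, followed by
 * every rx serializer and then every tx serializer.
 */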
4115 static void
4116 mxge_setup_serialize(struct mxge_softc *sc)
4117 {
4118 	int i = 0, slice;
4119 
4120 	/* Main + rx + tx */
4121 	sc->nserialize = (2 * sc->num_slices) + 1;
4122 	sc->serializes =
4123 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4124 	        M_DEVBUF, M_WAITOK | M_ZERO);
4125 
4126 	/*
4127 	 * Setup serializes
4128 	 *
4129 	 * NOTE: Order is critical
4130 	 */
4131 
4132 	KKASSERT(i < sc->nserialize);
4133 	sc->serializes[i++] = &sc->main_serialize;
4134 
4135 	for (slice = 0; slice < sc->num_slices; ++slice) {
4136 		KKASSERT(i < sc->nserialize);
4137 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4138 	}
4139 
4140 	for (slice = 0; slice < sc->num_slices; ++slice) {
4141 		KKASSERT(i < sc->nserialize);
4142 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4143 	}
4144 
4145 	KKASSERT(i == sc->nserialize);
4146 }
4147 
4148 static void
4149 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4150 {
4151 	struct mxge_softc *sc = ifp->if_softc;
4152 
4153 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4154 }
4155 
4156 static void
4157 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4158 {
4159 	struct mxge_softc *sc = ifp->if_softc;
4160 
4161 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4162 }
4163 
4164 static int
4165 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4166 {
4167 	struct mxge_softc *sc = ifp->if_softc;
4168 
4169 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4170 }
4171 
4172 #ifdef INVARIANTS
4173 
4174 static void
4175 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4176     boolean_t serialized)
4177 {
4178 	struct mxge_softc *sc = ifp->if_softc;
4179 
4180 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4181 	    slz, serialized);
4182 }
4183 
4184 #endif	/* INVARIANTS */
4185 
4186 #ifdef IFPOLL_ENABLE
4187 
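/*
 * if_poll rx handler for a single slice: drain the rx completion
 * queue when it holds work, otherwise hand the rx token back to
 * the NIC so newly arriving packets are not delayed (see the XXX
 * comment below).
 */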
4188 static void
4189 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4190 {
4191 	struct mxge_slice_state *ss = xss;
4192 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4193 
4194 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4195 
4196 	if (rx_done->entry[rx_done->idx].length != 0) {
4197 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4198 	} else {
4199 		/*
4200 		 * XXX
4201 		 * This register write obviously has a cost;
4202 		 * however, if we don't hand back the rx token,
4203 		 * the upcoming packets may suffer a ridiculously
4204 		 * large delay, as observed on 8AL-C using ping(8).
4205 		 */
4206 		*ss->irq_claim = be32toh(3);
4207 	}
4208 }
4209 
4210 static void
4211 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4212 {
4213 	struct mxge_softc *sc = ifp->if_softc;
4214 	int i;
4215 
4216 	if (info == NULL)
4217 		return;
4218 
4219 	/*
4220 	 * Only poll rx; polling tx and status doesn't seem to work
4221 	 */
4222 	for (i = 0; i < sc->num_slices; ++i) {
4223 		struct mxge_slice_state *ss = &sc->ss[i];
4224 		int cpu = ss->intr_cpuid;
4225 
4226 		KKASSERT(cpu < netisr_ncpus);
4227 		info->ifpi_rx[cpu].poll_func = mxge_npoll_rx;
4228 		info->ifpi_rx[cpu].arg = ss;
4229 		info->ifpi_rx[cpu].serializer = &ss->rx_data.rx_serialize;
4230 	}
4231 }
4232 
4233 #endif	/* IFPOLL_ENABLE */
4234 
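/*
 * Device attach: map the BAR, parse the EEPROM strings for the MAC
 * address, select and load the firmware, probe the slice count,
 * allocate slices, interrupts and rings, and finally attach the
 * ethernet interface.  Any failure funnels into mxge_detach() for
 * cleanup.
 */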
4235 static int
4236 mxge_attach(device_t dev)
4237 {
4238 	mxge_softc_t *sc = device_get_softc(dev);
4239 	struct ifnet *ifp = &sc->arpcom.ac_if;
4240 	int err, rid, i;
4241 
4242 	/*
4243 	 * Avoid rewriting half the lines in this file to use
4244 	 * &sc->arpcom.ac_if instead
4245 	 */
4246 	sc->ifp = ifp;
4247 	sc->dev = dev;
4248 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4249 
4250 	/* IFM_ETH_FORCEPAUSE can't be changed */
4251 	ifmedia_init(&sc->media, IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
4252 	    mxge_media_change, mxge_media_status);
4253 
4254 	lwkt_serialize_init(&sc->main_serialize);
4255 
4256 	mxge_fetch_tunables(sc);
4257 
4258 	err = bus_dma_tag_create(NULL,			/* parent */
4259 				 1,			/* alignment */
4260 				 0,			/* boundary */
4261 				 BUS_SPACE_MAXADDR,	/* low */
4262 				 BUS_SPACE_MAXADDR,	/* high */
4263 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4264 				 0, 			/* num segs */
4265 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4266 				 0,			/* flags */
4267 				 &sc->parent_dmat);	/* tag */
4268 	if (err != 0) {
4269 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4270 		goto failed;
4271 	}
4272 
4273 	callout_init_mp(&sc->co_hdl);
4274 
4275 	mxge_setup_cfg_space(sc);
4276 
4277 	/*
4278 	 * Map the board into the kernel
4279 	 */
4280 	rid = PCIR_BARS;
4281 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4282 	    &rid, RF_ACTIVE);
4283 	if (sc->mem_res == NULL) {
4284 		device_printf(dev, "could not map memory\n");
4285 		err = ENXIO;
4286 		goto failed;
4287 	}
4288 
4289 	sc->sram = rman_get_virtual(sc->mem_res);
4290 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4291 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4292 		device_printf(dev, "impossible memory region size %ld\n",
4293 		    rman_get_size(sc->mem_res));
4294 		err = ENXIO;
4295 		goto failed;
4296 	}
4297 
4298 	/*
4299 	 * Make NULL terminated copy of the EEPROM strings section of
4300 	 * lanai SRAM
4301 	 */
4302 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4303 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4304 	    rman_get_bushandle(sc->mem_res),
4305 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4306 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4307 	err = mxge_parse_strings(sc);
4308 	if (err != 0) {
4309 		device_printf(dev, "parse EEPROM string failed\n");
4310 		goto failed;
4311 	}
4312 
4313 	/*
4314 	 * Enable write combining for efficient use of PCIe bus
4315 	 */
4316 	mxge_enable_wc(sc);
4317 
4318 	/*
4319 	 * Allocate the out of band DMA memory
4320 	 */
4321 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4322 	if (err != 0) {
4323 		device_printf(dev, "alloc cmd DMA buf failed\n");
4324 		goto failed;
4325 	}
4326 	sc->cmd = sc->cmd_dma.dmem_addr;
4327 
4328 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4329 	if (err != 0) {
4330 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4331 		goto failed;
4332 	}
4333 
4334 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4335 	if (err != 0) {
4336 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4337 		goto failed;
4338 	}
4339 
4340 	/* Select & load the firmware */
4341 	err = mxge_select_firmware(sc);
4342 	if (err != 0) {
4343 		device_printf(dev, "select firmware failed\n");
4344 		goto failed;
4345 	}
4346 
4347 	mxge_slice_probe(sc);
4348 	err = mxge_alloc_slices(sc);
4349 	if (err != 0) {
4350 		device_printf(dev, "alloc slices failed\n");
4351 		goto failed;
4352 	}
4353 
4354 	err = mxge_alloc_intr(sc);
4355 	if (err != 0) {
4356 		device_printf(dev, "alloc intr failed\n");
4357 		goto failed;
4358 	}
4359 
4360 	/* Setup serializes */
4361 	mxge_setup_serialize(sc);
4362 
4363 	err = mxge_reset(sc, 0);
4364 	if (err != 0) {
4365 		device_printf(dev, "reset failed\n");
4366 		goto failed;
4367 	}
4368 
4369 	err = mxge_alloc_rings(sc);
4370 	if (err != 0) {
4371 		device_printf(dev, "failed to allocate rings\n");
4372 		goto failed;
4373 	}
4374 
4375 	ifp->if_baudrate = IF_Gbps(10UL);
4376 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4377 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4378 
4379 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4380 #if 0
4381 	/* Well, it's software, sigh */
4382 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4383 #endif
4384 	ifp->if_capenable = ifp->if_capabilities;
4385 
4386 	ifp->if_softc = sc;
4387 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4388 	ifp->if_init = mxge_init;
4389 	ifp->if_ioctl = mxge_ioctl;
4390 	ifp->if_start = mxge_start;
4391 #ifdef IFPOLL_ENABLE
4392 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4393 		ifp->if_npoll = mxge_npoll;
4394 #endif
4395 	ifp->if_serialize = mxge_serialize;
4396 	ifp->if_deserialize = mxge_deserialize;
4397 	ifp->if_tryserialize = mxge_tryserialize;
4398 #ifdef INVARIANTS
4399 	ifp->if_serialize_assert = mxge_serialize_assert;
4400 #endif
4401 
4402 	/* Increase TSO burst length */
4403 	ifp->if_tsolen = (32 * ETHERMTU);
4404 
4405 	/* Initialise the ifmedia structure */
4406 	mxge_media_init(sc);
4407 	mxge_media_probe(sc);
4408 
4409 	ether_ifattach(ifp, sc->mac_addr, NULL);
4410 
4411 	/* Setup TX rings and subqueues */
4412 	for (i = 0; i < sc->num_tx_rings; ++i) {
4413 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4414 		struct mxge_slice_state *ss = &sc->ss[i];
4415 
4416 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4417 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4418 		ifsq_set_priv(ifsq, &ss->tx);
4419 		ss->tx.ifsq = ifsq;
4420 
4421 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog, 0);
4422 	}
4423 
4424 	/*
4425 	 * XXX
4426 	 * We are not ready to do "gather" jumbo frames, so
4427 	 * limit the MTU to MJUMPAGESIZE
4428 	 */
4429 	sc->max_mtu = MJUMPAGESIZE -
4430 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4431 	sc->dying = 0;
4432 
4433 	err = mxge_setup_intr(sc);
4434 	if (err != 0) {
4435 		device_printf(dev, "alloc and setup intr failed\n");
4436 		ether_ifdetach(ifp);
4437 		goto failed;
4438 	}
4439 
4440 	mxge_add_sysctls(sc);
4441 
4442 	/* Increase non-cluster mbuf limit; used by small RX rings */
4443 	mb_inclimit(ifp->if_nmbclusters);
4444 
4445 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4446 	    sc->ss[0].intr_cpuid);
4447 	return 0;
4448 
4449 failed:
4450 	mxge_detach(dev);
4451 	return err;
4452 }
4453 
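/*
 * Device detach: mark the NIC dying, close it, tear down the
 * interrupts and the ethernet attachment, then release DMA memory
 * and bus resources roughly in the reverse order of attach.
 */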
4454 static int
4455 mxge_detach(device_t dev)
4456 {
4457 	mxge_softc_t *sc = device_get_softc(dev);
4458 
4459 	if (device_is_attached(dev)) {
4460 		struct ifnet *ifp = sc->ifp;
4461 		int mblimit = ifp->if_nmbclusters;
4462 
4463 		ifnet_serialize_all(ifp);
4464 
4465 		sc->dying = 1;
4466 		if (ifp->if_flags & IFF_RUNNING)
4467 			mxge_close(sc, 1);
4468 		callout_stop(&sc->co_hdl);
4469 
4470 		mxge_teardown_intr(sc, sc->num_slices);
4471 
4472 		ifnet_deserialize_all(ifp);
4473 
4474 		callout_terminate(&sc->co_hdl);
4475 
4476 		ether_ifdetach(ifp);
4477 
4478 		/* Decrease non-cluster mbuf limit increased by us */
4479 		mb_inclimit(-mblimit);
4480 	}
4481 	ifmedia_removeall(&sc->media);
4482 
4483 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4484 	    sc->sram != NULL)
4485 		mxge_dummy_rdma(sc, 0);
4486 
4487 	mxge_free_intr(sc);
4488 	mxge_rem_sysctls(sc);
4489 	mxge_free_rings(sc);
4490 
4491 	/* MUST after sysctls, intr and rings are freed */
4492 	mxge_free_slices(sc);
4493 
4494 	if (sc->dmabench_dma.dmem_addr != NULL)
4495 		mxge_dma_free(&sc->dmabench_dma);
4496 	if (sc->zeropad_dma.dmem_addr != NULL)
4497 		mxge_dma_free(&sc->zeropad_dma);
4498 	if (sc->cmd_dma.dmem_addr != NULL)
4499 		mxge_dma_free(&sc->cmd_dma);
4500 
4501 	if (sc->msix_table_res != NULL) {
4502 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4503 		    sc->msix_table_res);
4504 	}
4505 	if (sc->mem_res != NULL) {
4506 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4507 		    sc->mem_res);
4508 	}
4509 
4510 	if (sc->parent_dmat != NULL)
4511 		bus_dma_tag_destroy(sc->parent_dmat);
4512 
4513 	if (sc->ring_map != NULL)
4514 		if_ringmap_free(sc->ring_map);
4515 
4516 	return 0;
4517 }
4518 
4519 static int
4520 mxge_shutdown(device_t dev)
4521 {
4522 	return 0;
4523 }
4524 
4525 static void
4526 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4527 {
4528 	int i;
4529 
4530 	KKASSERT(sc->num_slices > 1);
4531 
4532 	for (i = 0; i < sc->num_slices; ++i) {
4533 		struct mxge_slice_state *ss = &sc->ss[i];
4534 
4535 		if (ss->intr_res != NULL) {
4536 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4537 			    ss->intr_rid, ss->intr_res);
4538 		}
4539 		if (ss->intr_rid >= 0)
4540 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4541 	}
4542 	if (setup)
4543 		pci_teardown_msix(sc->dev);
4544 }
4545 
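/*
 * Allocate one MSI-X vector per slice on the CPUs chosen by the
 * ring map.  Slice 0 services the combined interrupt; the other
 * slices take rx-only vectors, or rx/tx vectors when multiple tx
 * rings are enabled.
 */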
4546 static int
4547 mxge_alloc_msix(struct mxge_softc *sc)
4548 {
4549 	struct mxge_slice_state *ss;
4550 	int rid, error, i;
4551 	boolean_t setup = FALSE;
4552 
4553 	KKASSERT(sc->num_slices > 1);
4554 
4555 	ss = &sc->ss[0];
4556 
4557 	ss->intr_serialize = &sc->main_serialize;
4558 	ss->intr_func = mxge_msi;
4559 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4560 	    "%s comb", device_get_nameunit(sc->dev));
4561 	ss->intr_desc = ss->intr_desc0;
4562 	ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, 0);
4563 
4564 	for (i = 1; i < sc->num_slices; ++i) {
4565 		ss = &sc->ss[i];
4566 
4567 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4568 		if (sc->num_tx_rings == 1) {
4569 			ss->intr_func = mxge_msix_rx;
4570 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4571 			    "%s rx%d", device_get_nameunit(sc->dev), i);
4572 		} else {
4573 			ss->intr_func = mxge_msix_rxtx;
4574 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4575 			    "%s rxtx%d", device_get_nameunit(sc->dev), i);
4576 		}
4577 		ss->intr_desc = ss->intr_desc0;
4578 		ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, i);
4579 	}
4580 
4581 	rid = PCIR_BAR(2);
4582 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4583 	    &rid, RF_ACTIVE);
4584 	if (sc->msix_table_res == NULL) {
4585 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4586 		return ENXIO;
4587 	}
4588 
4589 	error = pci_setup_msix(sc->dev);
4590 	if (error) {
4591 		device_printf(sc->dev, "could not setup MSI-X\n");
4592 		goto back;
4593 	}
4594 	setup = TRUE;
4595 
4596 	for (i = 0; i < sc->num_slices; ++i) {
4597 		ss = &sc->ss[i];
4598 
4599 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4600 		    ss->intr_cpuid);
4601 		if (error) {
4602 			device_printf(sc->dev, "could not alloc "
4603 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4604 			goto back;
4605 		}
4606 
4607 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4608 		    &ss->intr_rid, RF_ACTIVE);
4609 		if (ss->intr_res == NULL) {
4610 			device_printf(sc->dev, "could not alloc "
4611 			    "MSI-X %d resource\n", i);
4612 			error = ENXIO;
4613 			goto back;
4614 		}
4615 	}
4616 
4617 	pci_enable_msix(sc->dev);
4618 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4619 back:
4620 	if (error)
4621 		mxge_free_msix(sc, setup);
4622 	return error;
4623 }
4624 
4625 static int
4626 mxge_alloc_intr(struct mxge_softc *sc)
4627 {
4628 	struct mxge_slice_state *ss;
4629 	u_int irq_flags;
4630 
4631 	if (sc->num_slices > 1) {
4632 		int error;
4633 
4634 		error = mxge_alloc_msix(sc);
4635 		if (error)
4636 			return error;
4637 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4638 		return 0;
4639 	}
4640 
4641 	ss = &sc->ss[0];
4642 
4643 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4644 	    &ss->intr_rid, &irq_flags);
4645 
4646 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4647 	    &ss->intr_rid, irq_flags);
4648 	if (ss->intr_res == NULL) {
4649 		device_printf(sc->dev, "could not alloc interrupt\n");
4650 		return ENXIO;
4651 	}
4652 
4653 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4654 		ss->intr_func = mxge_legacy;
4655 	else
4656 		ss->intr_func = mxge_msi;
4657 	ss->intr_serialize = &sc->main_serialize;
4658 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4659 
4660 	return 0;
4661 }
4662 
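/*
 * Install the interrupt handlers allocated earlier; on failure,
 * tear down only the handlers that were already installed.
 */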
4663 static int
4664 mxge_setup_intr(struct mxge_softc *sc)
4665 {
4666 	int i;
4667 
4668 	for (i = 0; i < sc->num_slices; ++i) {
4669 		struct mxge_slice_state *ss = &sc->ss[i];
4670 		int error;
4671 
4672 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4673 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4674 		    ss->intr_serialize, ss->intr_desc);
4675 		if (error) {
4676 			device_printf(sc->dev, "can't setup intr %d\n", i);
4677 			mxge_teardown_intr(sc, i);
4678 			return error;
4679 		}
4680 	}
4681 	return 0;
4682 }
4683 
4684 static void
4685 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4686 {
4687 	int i;
4688 
4689 	if (sc->ss == NULL)
4690 		return;
4691 
4692 	for (i = 0; i < cnt; ++i) {
4693 		struct mxge_slice_state *ss = &sc->ss[i];
4694 
4695 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4696 	}
4697 }
4698 
4699 static void
4700 mxge_free_intr(struct mxge_softc *sc)
4701 {
4702 	if (sc->ss == NULL)
4703 		return;
4704 
4705 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4706 		struct mxge_slice_state *ss = &sc->ss[0];
4707 
4708 		if (ss->intr_res != NULL) {
4709 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4710 			    ss->intr_rid, ss->intr_res);
4711 		}
4712 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4713 			pci_release_msi(sc->dev);
4714 	} else {
4715 		mxge_free_msix(sc, TRUE);
4716 	}
4717 }
4718