/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $

***************************************************************************/

#include "opt_ifpoll.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_poll.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>
#include <net/toeplitz.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386__) || defined(__x86_64__)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
#include <dev/netif/mxge/if_mxge_var.h>

#define MXGE_IFM	(IFM_ETHER | IFM_FDX | IFM_ETH_FORCEPAUSE)

#define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
#define MXGE_HWRSS_KEYLEN		16

/* Tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
static int mxge_deassert_wait = 1;
static int mxge_ticks;
static int mxge_num_slices = 0;
static int mxge_always_promisc = 0;
static int mxge_throttle = 0;
static int mxge_msi_enable = 1;
static int mxge_msix_enable = 1;
static int mxge_multi_tx = 1;
/*
 * Don't use RSS by default, it's just too slow.
 */
static int mxge_use_rss = 0;

static char mxge_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_FORCE_FULL;

static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
static const char *mxge_fw_aligned = "mxge_eth_z8e";
static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
TUNABLE_STR("hw.mxge.flow_ctrl", mxge_flowctrl, sizeof(mxge_flowctrl));
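
/*
 * These tunables are read from the kernel environment at boot; on
 * DragonFly they are normally set in /boot/loader.conf.  An
 * illustrative example (values are made up):
 *
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.msix.enable="0"
 */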

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);

static int mxge_alloc_intr(struct mxge_softc *sc);
static void mxge_free_intr(struct mxge_softc *sc);
static int mxge_setup_intr(struct mxge_softc *sc);
static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);

static device_method_t mxge_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	DEVMETHOD_END
};

static driver_t mxge_driver = {
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static void mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
static void mxge_watchdog_reset(mxge_softc_t *sc);
static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);

static int
mxge_probe(device_t dev)
{
	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
		int rev = pci_get_revid(dev);

		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386__) || defined(__x86_64__)
	vm_offset_t len;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
	    PAT_WRITE_COMBINING);
#endif
}

static int
mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
    bus_size_t alignment)
{
	bus_size_t boundary;
	int err;

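	/*
	 * A 4KB-aligned buffer larger than 4KB necessarily crosses 4KB
	 * boundaries, so the boundary restriction must be dropped (0);
	 * smaller allocations are kept within a single 4KB region.
	 */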
	if (bytes > 4096 && alignment == 4096)
		boundary = 0;
	else
		boundary = 4096;

	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
		return err;
	}
	return 0;
}

static void
mxge_dma_free(bus_dmamem_t *dma)
{
	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
	bus_dma_tag_destroy(dma->dmem_tag);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
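/*
 * For illustration only (not data from a real board), a blob might
 * read: "SN=123456\0MAC=00:60:dd:12:34:56\0PC=M3F-PCIXE-2\0\0";
 * mxge_parse_strings() walks these NUL-terminated records in order.
 */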
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	const char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");
	return ENXIO;
}

#if defined(__i386__) || defined(__x86_64__)

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/*
	 * XXXX
	 * Test below is commented because it is believed that doing
	 * config read/write beyond 0xff will access the config space
	 * for the next larger function.  Uncomment this and remove
	 * the hacky pmap_mapdev() way of accessing config space when
	 * DragonFly grows support for extended pcie config space access.
	 */
#if 0
	/*
	 * See if we can, by some miracle, access the extended
	 * config space
	 */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/*
	 * Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible via this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

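	/*
	 * Standard PCIe extended-config (ECAM) layout: 1MB per bus,
	 * 4KB per function, 8 functions per slot.  E.g. (illustrative)
	 * bus 1, slot 2, function 3 lands at base + 0x100000 + 0x13000.
	 */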
	off = base + 0x00100000UL * (unsigned long)bus +
	    0x00001000UL * (unsigned long)(func + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (!(vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
		    vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (bootverbose) {
		device_printf(sc->dev, "Enabled ECRC on upstream "
		    "Nvidia bridge at %d:%d:%d\n",
		    (int)bus, (int)slot, (int)func);
	}
}

#else	/* __i386__ || __x86_64__ */

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
}

#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
	int status;
	uint32_t len;
	const char *test = " ";

	/*
	 * Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
	    (cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
		    test, status);
	}
	return status;
}
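
/*
 * A minimal sketch (disabled, not used by the driver) of how the
 * cmd.data0 benchmark result decodes into MB/s: bytes moved are
 * transfers * len, and the factor of 2 converts 0.5us ticks into
 * bytes per microsecond, i.e. MB/s.
 */
#if 0
static uint32_t
mxge_dma_test_mbs(uint32_t data0, uint32_t len)
{
	uint32_t transfers = data0 >> 16;		/* upper 16 bits */
	uint32_t half_us_ticks = data0 & 0xffff;	/* lower 16 bits */

	return (transfers * len * 2) / half_us_ticks;
}
#endif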

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;

	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
			    pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * Load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0)
		return status;

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS) {
		device_printf(dev, "Falling back to ethp! "
		    "Please install up-to-date firmware\n");
	}
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (bootverbose) {
			device_printf(sc->dev,
			    "Assuming %s completions (forced)\n",
			    aligned ? "aligned" : "unaligned");
		}
		goto abort;
	}

	/*
	 * If the PCIe link width is 4 or less, we can use the aligned
	 * firmware and skip any checks
	 */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev, "PCIe x%d Link, "
		    "expect reduced performance\n", sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (mxge_firmware_probe(sc) == 0)
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return mxge_load_firmware(sc, 0);
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
		    be32toh(hdr->mcp_type));
		return EIO;
	}

	/* Save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (bootverbose)
		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	    &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		if_printf(sc->ifp, "Found firmware version %s\n",
		    sc->fw_version);
		if_printf(sc->ifp, "Driver needs %d.%d\n",
		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	return kmalloc(items * size, M_TEMP, M_WAITOK);
}

static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		if_printf(sc->ifp, "Could not find firmware image %s\n",
		    sc->fw_name);
		return ENOENT;
	}

	/* Setup zlib and decompress f/w */
	bzero(&zs, sizeof(zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/*
	 * The uncompressed size is stored as the firmware version,
	 * which would otherwise go unused
	 */
	fw_len = (size_t)fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		if_printf(sc->ifp, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* Check id */
	hdr_offset =
	htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		if_printf(sc->ifp, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
		    min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

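	/*
	 * Round buf_bytes up to the next 8-byte boundary; e.g. if the
	 * array starts at 0x1003, buf becomes 0x1008.  The same idiom
	 * appears in mxge_send_cmd() and mxge_load_firmware() below.
	 */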
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send an rdma command to the PCIe engine, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
		    (enable ? "enable" : "disable"), confirm, *confirm);
	}
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* Ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/*
	 * Wait up to 20ms
	 */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			if_printf(sc->ifp, "command %d failed, result = %d\n",
			    cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN) {
		if_printf(sc->ifp, "command %d timed out result = %d\n",
		    cmd, be32toh(response->result));
	}
	return err;
}
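
/*
 * Illustrative usage sketch (disabled): a command round-trip returns
 * 0 on success and leaves the firmware's 32-bit reply in cmd.data0.
 * The surrounding context (sc, ring_size) is hypothetical here.
 */
#if 0
	mxge_cmd_t cmd;

	cmd.data0 = 0;		/* slice 0 */
	if (mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd) == 0)
		ring_size = cmd.data0;
#endif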

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof(struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/*
	 * Find running firmware header
	 */
	hdr_offset =
	htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		if_printf(sc->ifp, "Running firmware has bad header offset "
		    "(%zu)\n", hdr_offset);
		return EIO;
	}

	/*
	 * Copy header of running firmware from SRAM to host memory to
	 * validate firmware
	 */
	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * Check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
		    "working around rx filter bug\n",
		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;

		/*
		 * Try to use the currently running firmware, if
		 * it is new enough
		 */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			if_printf(sc->ifp,
			    "failed to adopt running firmware\n");
			return status;
		}
		if_printf(sc->ifp, "Successfully adopted running firmware\n");

		if (sc->tx_boundary == 4096) {
			if_printf(sc->ifp,
			     "Using firmware currently running on NIC.  "
			     "For optimal\n");
			if_printf(sc->ifp, "performance consider loading "
			     "optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send a reload command to the bootstrap MCP, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/*
	 * FIX: All newest firmware should un-protect the bottom of
	 * the sram before handoff. However, the very first interfaces
	 * do not. Therefore the handoff copy must skip the first 8 bytes
	 */
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
		    confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;

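	/*
	 * Pack the station address big-endian into two command words,
	 * e.g. (illustrative) 00:60:dd:12:34:56 yields
	 * data0 = 0x0060dd12 and data1 = 0x00003456.
	 */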
	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
	    (addr[2] << 8) | addr[3];
	cmd.data1 = (addr[4] << 8) | (addr[5]);
	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
	if (status) {
		if_printf(sc->ifp, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
	if (status)
		if_printf(sc->ifp, "Failed to set promisc mode\n");
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
		    "error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI) {
		/* Request to disable multicast filtering, so quit here */
		return;
	}

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
		    "error status: %d\n", err);
		return;
	}

	/*
	 * Walk the multicast list, and add each address
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;

		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		    &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		    &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
			    "error status: %d\n", err);
			/* Abort, leaving multicast filtering off */
			return;
		}
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
		    "error status: %d\n", err);
	}
}

#if 0
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/*
	 * Try to set nbufs to see if we can
	 * use virtually contiguous jumbos
	 */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
#endif

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status, rx_intr_size;

	/*
	 * Try to send a reset command to the card to see if it
	 * is alive
	 */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		if_printf(sc->ifp, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/*
	 * Set the intrq size
	 * XXX assume 4byte mcp_slot
	 */
	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
	cmd.data0 = rx_intr_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
	if (sc->num_slices > 1) {
		/* Ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to get number of slices\n");
			return status;
		}

		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		if (sc->num_tx_rings > 1)
			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];

			rx_done = &ss->rx_data.rx_done;
			memset(rx_done->entry, 0, rx_intr_size);

			cmd.data0 =
			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data1 =
			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
			    &cmd);
		}
	}

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
	    &cmd);
	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);

	if (status != 0) {
		if_printf(sc->ifp, "failed set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* Run a DMA benchmark */
	mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);

		/* Reset mcp/driver shared state back to 0 */
		ss->rx_data.rx_done.idx = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->rx_data.rx_big.cnt = 0;
		ss->rx_data.rx_small.cnt = 0;
		if (ss->fw_stats != NULL)
			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
	}
	sc->rdma_tags_available = 15;

	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);

	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
			if_printf(sc->ifp, "can't enable throttle\n");
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0)
		return err;

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	int err, use_rss;

	sc = arg1;
	use_rss = sc->use_rss;
	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
	if (err != 0)
		return err;

	if (use_rss == sc->use_rss)
		return 0;

	ifnet_serialize_all(sc->ifp);

	sc->use_rss = use_rss;
	if (sc->ifp->if_flags & IFF_RUNNING) {
		mxge_close(sc, 0);
		mxge_open(sc);
	}

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0)
		return err;

	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	if (sc->ss != NULL) {
		struct mxge_slice_state *ss;
		int slice;

		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];
			if (ss->sysctl_tree != NULL) {
				sysctl_ctx_free(&ss->sysctl_ctx);
				ss->sysctl_tree = NULL;
			}
		}
	}

	if (sc->slice_sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->slice_sysctl_ctx);
		sc->slice_sysctl_tree = NULL;
	}
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/*
	 * Random information
	 */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
	    CTLFLAG_RD, &sc->link_width, 0, "link width");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
	    CTLFLAG_RD, &sc->read_write_dma, 0,
	    "DMA concurrent Read/Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
	    CTLFLAG_RD, &sc->watchdog_resets, 0,
	    "Number of times NIC was reset");

	/*
	 * Performance related tunables
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
	    "Interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
	    "Transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
	    "Use RSS");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
	    CTLFLAG_RW, &mxge_deassert_wait, 0,
	    "Wait for IRQ line to go low in ihandler");

	/*
	 * Stats block from firmware is in network byte order.
	 * Need to swap it
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
	    mxge_handle_be32, "I", "link up");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
	    mxge_handle_be32, "I", "rdma_tags_available");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
	    mxge_handle_be32, "I", "dropped_bad_crc32");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
	    mxge_handle_be32, "I", "dropped_bad_phy");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
	    mxge_handle_be32, "I", "dropped_link_overflow");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_multicast_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_big_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_small_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
	    mxge_handle_be32, "I", "dropped_overrun");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
	    mxge_handle_be32, "I", "dropped_pause");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
	    mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_unicast_filtered");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
	if (sc->slice_sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add slice sysctl node\n");
		return;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
		    slice_num, CTLFLAG_RD, 0, "");
		if (ss->sysctl_tree == NULL) {
			device_printf(sc->dev,
			    "can't add %d slice sysctl node\n", slice);
			return;	/* XXX continue? */
		}
		children = SYSCTL_CHILDREN(ss->sysctl_tree);

		/*
		 * XXX change to ULONG
		 */

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * backwards one at a time and handle ring wraps
 */
static __inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
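/*
 * For example (illustrative numbers): with a 512-entry ring
 * (tx->mask == 511), a 4-request chain starting at slot 510 would
 * wrap through slots 510, 511, 0 and 1 and take the backwards path
 * above, while the same chain starting at slot 100 takes the 32-byte
 * fast path below.
 */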
static __inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < cnt - 1; i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * Submit all but the first request, and ensure
		 * that it is submitted below
		 */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* Submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* Re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_pullup_tso(struct mbuf **mp)
{
	int hoff, iphlen, thoff;
	struct mbuf *m;

	m = *mp;
	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));

	iphlen = m->m_pkthdr.csum_iphlen;
	thoff = m->m_pkthdr.csum_thlen;
	hoff = m->m_pkthdr.csum_lhlen;

	KASSERT(iphlen > 0, ("invalid ip hlen"));
	KASSERT(thoff > 0, ("invalid tcp hlen"));
	KASSERT(hoff > 0, ("invalid ether hlen"));

	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
		m = m_pullup(m, hoff + iphlen + thoff);
		if (m == NULL) {
			*mp = NULL;
			return ENOBUFS;
		}
		*mp = m;
	}
	return 0;
}

static int
mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
    struct mbuf *m, int busdma_seg_cnt)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	struct mxge_buffer_state *info_last;
	bus_dmamap_t map = info_map->map;

	mss = m->m_pkthdr.tso_segsz;

	/*
	 * Negative cum_len signifies to the send loop that we are
	 * still in the header portion of the TSO packet.
	 */
	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
	    m->m_pkthdr.csum_thlen);

	/*
	 * TSO implies checksum offload on this hardware
	 */
	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/*
	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
	 * out where to put the checksum by parsing the header.
	 */
	pseudo_hdr_offset = htobe16(mss);

	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;

	/*
	 * "rdma_count" is the number of RDMAs belonging to the current
	 * packet BEFORE the current send request.  For non-TSO packets,
	 * this is equal to "count".
	 *
	 * For TSO packets, rdma_count needs to be reset to 0 after a
	 * segment cut.
	 *
	 * The rdma_count field of the send request is the number of
	 * RDMAs of the packet starting at that request.  For TSO send
	 * requests with one or more cuts in the middle, this is the
	 * number of RDMAs starting after the last cut in the request.
	 * All previous segments before the last cut implicitly have 1
	 * RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand, it must be
	 * filled-in retroactively - after each segmentation cut or at
	 * the end of the entire packet.
	 */

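	/*
	 * Worked example (illustrative): when an mss boundary falls in
	 * the middle of a busdma segment, the loop below "chops" it:
	 * MXGEFW_FLAGS_TSO_CHOP marks the request ending the old TCP
	 * segment, MXGEFW_FLAGS_FIRST marks the request starting the
	 * next one, and the (req - rdma_count)->rdma_count back-patch
	 * records how many requests the finished segment spanned.
	 */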
1719 	while (busdma_seg_cnt) {
1720 		/*
1721 		 * Break the busdma segment up into pieces
1722 		 */
1723 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1724 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1725 		len = seg->ds_len;
1726 
1727 		while (len) {
1728 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1729 			seglen = len;
1730 			cum_len_next = cum_len + seglen;
1731 			(req - rdma_count)->rdma_count = rdma_count + 1;
1732 			if (__predict_true(cum_len >= 0)) {
1733 				/* Payload */
1734 				chop = (cum_len_next > mss);
1735 				cum_len_next = cum_len_next % mss;
1736 				next_is_first = (cum_len_next == 0);
1737 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1738 				flags_next |=
1739 				    next_is_first * MXGEFW_FLAGS_FIRST;
1740 				rdma_count |= -(chop | next_is_first);
1741 				rdma_count += chop & !next_is_first;
1742 			} else if (cum_len_next >= 0) {
1743 				/* Header ends */
1744 				rdma_count = -1;
1745 				cum_len_next = 0;
1746 				seglen = -cum_len;
1747 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1748 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1749 				    MXGEFW_FLAGS_FIRST |
1750 				    (small * MXGEFW_FLAGS_SMALL);
1751 			}
1752 
1753 			req->addr_high = high_swapped;
1754 			req->addr_low = htobe32(low);
1755 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1756 			req->pad = 0;
1757 			req->rdma_count = 1;
1758 			req->length = htobe16(seglen);
1759 			req->cksum_offset = cksum_offset;
1760 			req->flags =
1761 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1762 			low += seglen;
1763 			len -= seglen;
1764 			cum_len = cum_len_next;
1765 			flags = flags_next;
1766 			req++;
1767 			cnt++;
1768 			rdma_count++;
1769 			if (__predict_false(cksum_offset > seglen))
1770 				cksum_offset -= seglen;
1771 			else
1772 				cksum_offset = 0;
1773 			if (__predict_false(cnt > tx->max_desc))
1774 				goto drop;
1775 		}
1776 		busdma_seg_cnt--;
1777 		seg++;
1778 	}
1779 	(req - rdma_count)->rdma_count = rdma_count;
1780 
1781 	do {
1782 		req--;
1783 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1784 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1785 
1786 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1787 
1788 	info_map->map = info_last->map;
1789 	info_last->map = map;
1790 	info_last->m = m;
1791 
1792 	mxge_submit_req(tx, tx->req_list, cnt);
1793 
1794 	if (tx->send_go != NULL && tx->queue_active == 0) {
1795 		/* Tell the NIC to start polling this slice */
1796 		*tx->send_go = 1;
1797 		tx->queue_active = 1;
1798 		tx->activate++;
1799 		wmb();
1800 	}
1801 	return 0;
1802 
1803 drop:
1804 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1805 	m_freem(m);
1806 	return ENOBUFS;
1807 }
1808 
1809 static int
1810 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1811 {
1812 	mcp_kreq_ether_send_t *req;
1813 	bus_dma_segment_t *seg;
1814 	bus_dmamap_t map;
1815 	int cnt, cum_len, err, i, idx, odd_flag;
1816 	uint16_t pseudo_hdr_offset;
1817 	uint8_t flags, cksum_offset;
1818 	struct mxge_buffer_state *info_map, *info_last;
1819 
1820 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1821 		err = mxge_pullup_tso(&m);
1822 		if (__predict_false(err))
1823 			return err;
1824 	}
1825 
1826 	/*
1827 	 * Map the frame for DMA
1828 	 */
1829 	idx = tx->req & tx->mask;
1830 	info_map = &tx->info[idx];
1831 	map = info_map->map;
1832 
1833 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1834 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1835 	if (__predict_false(err != 0))
1836 		goto drop;
1837 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1838 
1839 	/*
1840 	 * TSO is different enough that we handle it in another routine
1841 	 */
1842 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1843 		return mxge_encap_tso(tx, info_map, m, cnt);
1844 
1845 	req = tx->req_list;
1846 	cksum_offset = 0;
1847 	pseudo_hdr_offset = 0;
1848 	flags = MXGEFW_FLAGS_NO_TSO;
1849 
1850 	/*
1851 	 * Checksum offloading
1852 	 */
1853 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1854 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1855 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1856 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1857 		req->cksum_offset = cksum_offset;
1858 		flags |= MXGEFW_FLAGS_CKSUM;
1859 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1860 	} else {
1861 		odd_flag = 0;
1862 	}
1863 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1864 		flags |= MXGEFW_FLAGS_SMALL;
1865 
1866 	/*
1867 	 * Convert segments into a request list
1868 	 */
1869 	cum_len = 0;
1870 	seg = tx->seg_list;
1871 	req->flags = MXGEFW_FLAGS_FIRST;
1872 	for (i = 0; i < cnt; i++) {
1873 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1874 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1875 		req->length = htobe16(seg->ds_len);
1876 		req->cksum_offset = cksum_offset;
1877 		if (cksum_offset > seg->ds_len)
1878 			cksum_offset -= seg->ds_len;
1879 		else
1880 			cksum_offset = 0;
1881 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1882 		req->pad = 0; /* complete solid 16-byte block */
1883 		req->rdma_count = 1;
1884 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1885 		cum_len += seg->ds_len;
1886 		seg++;
1887 		req++;
1888 		req->flags = 0;
1889 	}
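	/*
	 * The loop pre-clears flags one slot past the last segment;
	 * step back so req again points at the final descriptor.
	 */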
1890 	req--;
1891 
1892 	/*
1893 	 * Pad runts out to 60 bytes (64-byte minimum frame less 4-byte FCS)
1894 	 */
1895 	if (cum_len < 60) {
1896 		req++;
1897 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1898 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1899 		req->length = htobe16(60 - cum_len);
1900 		req->cksum_offset = 0;
1901 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1902 		req->pad = 0; /* complete solid 16-byte block */
1903 		req->rdma_count = 1;
1904 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1905 		cnt++;
1906 	}
1907 
1908 	tx->req_list[0].rdma_count = cnt;
1909 #if 0
1910 	/* print what the firmware will see */
1911 	for (i = 0; i < cnt; i++) {
1912 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1913 		    "cso:%d, flags:0x%x, rdma:%d\n",
1914 		    i, (int)ntohl(tx->req_list[i].addr_high),
1915 		    (int)ntohl(tx->req_list[i].addr_low),
1916 		    (int)ntohs(tx->req_list[i].length),
1917 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1918 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1919 		    tx->req_list[i].rdma_count);
1920 	}
1921 	kprintf("--------------\n");
1922 #endif
1923 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1924 
1925 	info_map->map = info_last->map;
1926 	info_last->map = map;
1927 	info_last->m = m;
1928 
1929 	mxge_submit_req(tx, tx->req_list, cnt);
1930 
1931 	if (tx->send_go != NULL && tx->queue_active == 0) {
1932 		/* Tell the NIC to start polling this slice */
1933 		*tx->send_go = 1;
1934 		tx->queue_active = 1;
1935 		tx->activate++;
1936 		wmb();
1937 	}
1938 	return 0;
1939 
1940 drop:
1941 	m_freem(m);
1942 	return err;
1943 }
1944 
1945 static void
1946 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1947 {
1948 	mxge_softc_t *sc = ifp->if_softc;
1949 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1950 	bus_addr_t zeropad;
1951 	int encap = 0;
1952 
1953 	KKASSERT(tx->ifsq == ifsq);
1954 	ASSERT_SERIALIZED(&tx->tx_serialize);
1955 
1956 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1957 		return;
1958 
1959 	zeropad = sc->zeropad_dma.dmem_busaddr;
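	/*
	 * (tx->req - tx->done) is the number of descriptors still in
	 * flight; keep at least max_desc slots free so that the next
	 * frame, which may consume up to max_desc descriptors, can
	 * never overrun the ring.
	 */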
1960 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1961 		struct mbuf *m;
1962 		int error;
1963 
1964 		m = ifsq_dequeue(ifsq);
1965 		if (m == NULL)
1966 			goto done;
1967 
1968 		BPF_MTAP(ifp, m);
1969 		error = mxge_encap(tx, m, zeropad);
1970 		if (!error)
1971 			encap = 1;
1972 		else
1973 			IFNET_STAT_INC(ifp, oerrors, 1);
1974 	}
1975 
1976 	/* Ran out of transmit slots */
1977 	ifsq_set_oactive(ifsq);
1978 done:
1979 	if (encap)
1980 		tx->watchdog.wd_timer = 5;
1981 }
1982 
1983 static void
1984 mxge_watchdog(struct ifaltq_subque *ifsq)
1985 {
1986 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
1987 	struct mxge_softc *sc = ifp->if_softc;
1988 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1989 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1990 
1991 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
1992 
1993 	/* Check for pause blocking before resetting */
1994 	if (tx->watchdog_rx_pause == rx_pause) {
1995 		mxge_warn_stuck(sc, tx, 0);
1996 		mxge_watchdog_reset(sc);
1997 		return;
1998 	} else {
1999 		if_printf(ifp, "Flow control blocking xmits, "
2000 		    "check link partner\n");
2001 	}
2002 	tx->watchdog_rx_pause = rx_pause;
2003 }
2004 
2005 /*
2006  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2007  * at most 32 bytes at a time, so as to avoid involving the software
2008  * PIO handler in the NIC.  We re-write the first segment's low
2009  * DMA address to mark it valid only after we write the entire chunk
2010  * in a burst.
2011  */
2012 static __inline void
2013 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2014     mcp_kreq_ether_recv_t *src)
2015 {
2016 	uint32_t low;
2017 
2018 	low = src->addr_low;
2019 	src->addr_low = 0xffffffff;
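	/*
	 * Copy with the first entry's address poisoned; the NIC treats
	 * the block as invalid until the real addr_low is written below.
	 */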
2020 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2021 	wmb();
2022 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2023 	wmb();
2024 	src->addr_low = low;
2025 	dst->addr_low = low;
2026 	wmb();
2027 }
2028 
2029 static int
2030 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2031     boolean_t init)
2032 {
2033 	bus_dma_segment_t seg;
2034 	struct mbuf *m;
2035 	int cnt, err, mflag;
2036 
2037 	mflag = M_NOWAIT;
2038 	if (__predict_false(init))
2039 		mflag = M_WAITOK;
2040 
2041 	m = m_gethdr(mflag, MT_DATA);
2042 	if (m == NULL) {
2043 		err = ENOBUFS;
2044 		if (__predict_false(init)) {
2045 			/*
2046 			 * During initialization, there
2047 			 * is nothing to setup; bail out
2048 			 */
2049 			return err;
2050 		}
2051 		goto done;
2052 	}
2053 	m->m_len = m->m_pkthdr.len = MHLEN;
2054 
2055 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2056 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2057 	if (err != 0) {
2058 		m_freem(m);
2059 		if (__predict_false(init)) {
2060 			/*
2061 			 * During initialization, there
2062 			 * is nothing to setup; bail out
2063 			 */
2064 			return err;
2065 		}
2066 		goto done;
2067 	}
2068 
2069 	rx->info[idx].m = m;
2070 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2071 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2072 
2073 done:
2074 	if ((idx & 7) == 7)
2075 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2076 	return err;
2077 }
2078 
2079 static int
2080 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2081     boolean_t init)
2082 {
2083 	bus_dma_segment_t seg;
2084 	struct mbuf *m;
2085 	int cnt, err, mflag;
2086 
2087 	mflag = M_NOWAIT;
2088 	if (__predict_false(init))
2089 		mflag = M_WAITOK;
2090 
2091 	if (rx->cl_size == MCLBYTES)
2092 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2093 	else
2094 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2095 	if (m == NULL) {
2096 		err = ENOBUFS;
2097 		if (__predict_false(init)) {
2098 			/*
2099 			 * During initialization, there
2100 			 * is nothing to setup; bail out
2101 			 */
2102 			return err;
2103 		}
2104 		goto done;
2105 	}
2106 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2107 
2108 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2109 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2110 	if (err != 0) {
2111 		m_freem(m);
2112 		if (__predict_false(init)) {
2113 			/*
2114 			 * During initialization, there
2115 			 * is nothing to setup; bail out
2116 			 */
2117 			return err;
2118 		}
2119 		goto done;
2120 	}
2121 
2122 	rx->info[idx].m = m;
2123 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2124 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2125 
2126 done:
2127 	if ((idx & 7) == 7)
2128 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2129 	return err;
2130 }
2131 
2132 /*
2133  * Myri10GE hardware checksums are not valid if the sender
2134  * padded the frame with non-zero padding.  This is because
2135  * the firmware just does a simple 16-bit 1s complement
2136  * checksum across the entire frame, excluding the first 14
2137  * bytes.  It is best to simply check the checksum and
2138  * tell the stack about it only if the checksum is good.
2139  */
2140 static __inline uint16_t
2141 mxge_rx_csum(struct mbuf *m, int csum)
2142 {
2143 	const struct ether_header *eh;
2144 	const struct ip *ip;
2145 	uint16_t c;
2146 
2147 	eh = mtod(m, const struct ether_header *);
2148 
2149 	/* Only deal with IPv4 TCP & UDP for now */
2150 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2151 		return 1;
2152 
2153 	ip = (const struct ip *)(eh + 1);
2154 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2155 		return 1;
2156 
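	/*
	 * The firmware csum is a 1s complement sum over everything past
	 * the 14-byte Ethernet header, IP header included.  A valid IP
	 * header sums to 0xffff and thus drops out in 1s complement
	 * arithmetic, so folding in the pseudo-header (with the IP
	 * header length subtracted from ip_len) yields 0xffff, i.e.
	 * c == 0 after the final invert, exactly when the TCP/UDP
	 * checksum is good.
	 */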
2157 #ifdef INET
2158 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2159 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2160 	          (ip->ip_hl << 2) + ip->ip_p));
2161 #else
2162 	c = 1;
2163 #endif
2164 	c ^= 0xffff;
2165 	return c;
2166 }
2167 
2168 static void
2169 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2170 {
2171 	struct ether_vlan_header *evl;
2172 	uint32_t partial;
2173 
2174 	evl = mtod(m, struct ether_vlan_header *);
2175 
2176 	/*
2177 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2178 	 * what the firmware thought was the end of the ethernet
2179 	 * header.
2180 	 */
2181 
2182 	/* Put checksum into host byte order */
2183 	*csum = ntohs(*csum);
2184 
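	/*
	 * Subtract the 4-byte 802.1q tag from the checksum: in 1s
	 * complement arithmetic, adding ~partial (with end-around
	 * carry) is the same as subtracting partial.
	 */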
2185 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2186 	*csum += ~partial;
2187 	*csum += ((*csum) < ~partial);
2188 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2189 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2190 
2191 	/*
2192 	 * Restore checksum to network byte order;
2193 	 * later consumers expect this
2194 	 */
2195 	*csum = htons(*csum);
2196 
2197 	/* save the tag */
2198 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2199 	m->m_flags |= M_VLANTAG;
2200 
2201 	/*
2202 	 * Remove the 802.1q header by copying the Ethernet
2203 	 * addresses over it and adjusting the beginning of
2204 	 * the data in the mbuf.  The encapsulated Ethernet
2205 	 * type field is already in place.
2206 	 */
2207 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2208 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2209 	m_adj(m, EVL_ENCAPLEN);
2210 }
2211 
2212 
2213 static __inline void
2214 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2215     uint32_t len, uint32_t csum)
2216 {
2217 	struct mbuf *m;
2218 	const struct ether_header *eh;
2219 	bus_dmamap_t old_map;
2220 	int idx;
2221 
2222 	idx = rx->cnt & rx->mask;
2223 	rx->cnt++;
2224 
2225 	/* Save a pointer to the received mbuf */
2226 	m = rx->info[idx].m;
2227 
2228 	/* Try to replace the received mbuf */
2229 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2230 		/* Drop the frame -- the old mbuf is re-cycled */
2231 		IFNET_STAT_INC(ifp, ierrors, 1);
2232 		return;
2233 	}
2234 
2235 	/* Unmap the received buffer */
2236 	old_map = rx->info[idx].map;
2237 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2238 	bus_dmamap_unload(rx->dmat, old_map);
2239 
2240 	/* Swap the bus_dmamap_t's */
2241 	rx->info[idx].map = rx->extra_map;
2242 	rx->extra_map = old_map;
2243 
2244 	/*
2245 	 * The MCP (firmware) implicitly skips the first 2 pad bytes
2246 	 * (MXGEFW_PAD) so that the packet's IP header lands 32-bit aligned
2247 	 */
2248 	m->m_data += MXGEFW_PAD;
2249 
2250 	m->m_pkthdr.rcvif = ifp;
2251 	m->m_len = m->m_pkthdr.len = len;
2252 
2253 	IFNET_STAT_INC(ifp, ipackets, 1);
2254 
2255 	eh = mtod(m, const struct ether_header *);
2256 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2257 		mxge_vlan_tag_remove(m, &csum);
2258 
2259 	/* If the checksum is valid, mark it in the mbuf header */
2260 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2261 	    mxge_rx_csum(m, csum) == 0) {
2262 		/* Tell the stack that the checksum is good */
2263 		m->m_pkthdr.csum_data = 0xffff;
2264 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2265 		    CSUM_DATA_VALID;
2266 	}
2267 	ifp->if_input(ifp, m, NULL, -1);
2268 }
2269 
2270 static __inline void
2271 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2272     uint32_t len, uint32_t csum)
2273 {
2274 	const struct ether_header *eh;
2275 	struct mbuf *m;
2276 	bus_dmamap_t old_map;
2277 	int idx;
2278 
2279 	idx = rx->cnt & rx->mask;
2280 	rx->cnt++;
2281 
2282 	/* Save a pointer to the received mbuf */
2283 	m = rx->info[idx].m;
2284 
2285 	/* Try to replace the received mbuf */
2286 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2287 		/* Drop the frame -- the old mbuf is re-cycled */
2288 		IFNET_STAT_INC(ifp, ierrors, 1);
2289 		return;
2290 	}
2291 
2292 	/* Unmap the received buffer */
2293 	old_map = rx->info[idx].map;
2294 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2295 	bus_dmamap_unload(rx->dmat, old_map);
2296 
2297 	/* Swap the bus_dmamap_t's */
2298 	rx->info[idx].map = rx->extra_map;
2299 	rx->extra_map = old_map;
2300 
2301 	/*
2302 	 * The MCP (firmware) implicitly skips the first 2 pad bytes
2303 	 * (MXGEFW_PAD) so that the packet's IP header lands 32-bit aligned
2304 	 */
2305 	m->m_data += MXGEFW_PAD;
2306 
2307 	m->m_pkthdr.rcvif = ifp;
2308 	m->m_len = m->m_pkthdr.len = len;
2309 
2310 	IFNET_STAT_INC(ifp, ipackets, 1);
2311 
2312 	eh = mtod(m, const struct ether_header *);
2313 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2314 		mxge_vlan_tag_remove(m, &csum);
2315 
2316 	/* If the checksum is valid, mark it in the mbuf header */
2317 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2318 	    mxge_rx_csum(m, csum) == 0) {
2319 		/* Tell the stack that the checksum is good */
2320 		m->m_pkthdr.csum_data = 0xffff;
2321 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2322 		    CSUM_DATA_VALID;
2323 	}
2324 	ifp->if_input(ifp, m, NULL, -1);
2325 }
2326 
2327 static __inline void
2328 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2329 {
2330 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2331 
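	/*
	 * A negative cycle (as passed by the interrupt handlers)
	 * effectively means "no limit": drain the ring until no
	 * completed entries remain.
	 */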
2332 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2333 		uint16_t length, checksum;
2334 
2335 		length = ntohs(rx_done->entry[rx_done->idx].length);
2336 		rx_done->entry[rx_done->idx].length = 0;
2337 
2338 		checksum = rx_done->entry[rx_done->idx].checksum;
2339 
2340 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2341 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2342 			    length, checksum);
2343 		} else {
2344 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2345 			    length, checksum);
2346 		}
2347 
2348 		rx_done->idx++;
2349 		rx_done->idx &= rx_done->mask;
2350 		--cycle;
2351 	}
2352 }
2353 
2354 static __inline void
2355 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2356 {
2357 	ASSERT_SERIALIZED(&tx->tx_serialize);
2358 
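	/*
	 * tx->done counts retired descriptors, while tx->pkt_done (and
	 * the firmware's mcp_idx) count whole packets; only the first
	 * descriptor of each packet carries the mbuf and DMA map.
	 */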
2359 	while (tx->pkt_done != mcp_idx) {
2360 		struct mbuf *m;
2361 		int idx;
2362 
2363 		idx = tx->done & tx->mask;
2364 		tx->done++;
2365 
2366 		m = tx->info[idx].m;
2367 		/*
2368 		 * mbuf and DMA map only attached to the first
2369 		 * segment per-mbuf.
2370 		 */
2371 		if (m != NULL) {
2372 			tx->pkt_done++;
2373 			IFNET_STAT_INC(ifp, opackets, 1);
2374 			tx->info[idx].m = NULL;
2375 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2376 			m_freem(m);
2377 		}
2378 	}
2379 
2380 	/*
2381 	 * If we have space, clear OACTIVE to tell the stack that
2382 	 * it's OK to send packets
2383 	 */
2384 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2385 		ifsq_clr_oactive(tx->ifsq);
2386 		if (tx->req == tx->done) {
2387 			/* Reset watchdog */
2388 			tx->watchdog.wd_timer = 0;
2389 		}
2390 	}
2391 
2392 	if (!ifsq_is_empty(tx->ifsq))
2393 		ifsq_devstart(tx->ifsq);
2394 
2395 	if (tx->send_stop != NULL && tx->req == tx->done) {
2396 		/*
2397 		 * Let the NIC stop polling this queue, since there
2398 		 * are no more transmits pending
2399 		 */
2400 		*tx->send_stop = 1;
2401 		tx->queue_active = 0;
2402 		tx->deactivate++;
2403 		wmb();
2404 	}
2405 }
2406 
2407 static struct mxge_media_type mxge_xfp_media_types[] = {
2408 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2409 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2410 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2411 	{IFM_NONE,	(1 << 5),	"10GBASE-ER"},
2412 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2413 	{IFM_NONE,	(1 << 3),	"10GBASE-SW"},
2414 	{IFM_NONE,	(1 << 2),	"10GBASE-LW"},
2415 	{IFM_NONE,	(1 << 1),	"10GBASE-EW"},
2416 	{IFM_NONE,	(1 << 0),	"Reserved"}
2417 };
2418 
2419 static struct mxge_media_type mxge_sfp_media_types[] = {
2420 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2421 	{IFM_NONE,	(1 << 7),	"Reserved"},
2422 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2423 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2424 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2425 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2426 };
2427 
2428 static void
2429 mxge_media_set(mxge_softc_t *sc, int media_type)
2430 {
2431 	int fc_opt = 0;
2432 
2433 	if (media_type == IFM_NONE)
2434 		return;
2435 
2436 	if (sc->pause)
2437 		fc_opt = IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
2438 
2439 	ifmedia_add(&sc->media, MXGE_IFM | media_type, 0, NULL);
2440 	ifmedia_set(&sc->media, MXGE_IFM | media_type | fc_opt);
2441 
2442 	sc->current_media = media_type;
2443 }
2444 
2445 static void
2446 mxge_media_unset(mxge_softc_t *sc)
2447 {
2448 	ifmedia_removeall(&sc->media);
2449 	sc->current_media = IFM_NONE;
2450 }
2451 
2452 static void
2453 mxge_media_init(mxge_softc_t *sc)
2454 {
2455 	const char *ptr;
2456 	int i;
2457 
2458 	mxge_media_unset(sc);
2459 
2460 	/*
2461 	 * Parse the product code to determine the interface type
2462 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2463 	 * after the 3rd dash in the driver's cached copy of the
2464 	 * EEPROM's product code string.
2465 	 */
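	/*
	 * For example, a (hypothetical) product code such as
	 * "10G-PCIE-8B-S" has 'S' after the third dash, i.e. an
	 * SFP+ connector.
	 */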
2466 	ptr = sc->product_code_string;
2467 	if (ptr == NULL) {
2468 		if_printf(sc->ifp, "Missing product code\n");
2469 		return;
2470 	}
2471 
2472 	for (i = 0; i < 3; i++, ptr++) {
2473 		ptr = strchr(ptr, '-');
2474 		if (ptr == NULL) {
2475 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2476 			return;
2477 		}
2478 	}
2479 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2480 		/* -C is CX4 */
2481 		sc->connector = MXGE_CX4;
2482 		mxge_media_set(sc, IFM_10G_CX4);
2483 	} else if (*ptr == 'Q') {
2484 		/* -Q is Quad Ribbon Fiber */
2485 		sc->connector = MXGE_QRF;
2486 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2487 		/* DragonFly has no media type for Quad ribbon fiber */
2488 	} else if (*ptr == 'R') {
2489 		/* -R is XFP */
2490 		sc->connector = MXGE_XFP;
2491 		/* NOTE: ifmedia will be installed later */
2492 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2493 		/* -S or -2S is SFP+ */
2494 		sc->connector = MXGE_SFP;
2495 		/* NOTE: ifmedia will be installed later */
2496 	} else {
2497 		sc->connector = MXGE_UNK;
2498 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2499 	}
2500 }
2501 
2502 /*
2503  * Determine the media type for a NIC.  Some XFPs will identify
2504  * themselves only when their link is up, so this is initiated via a
2505  * link up interrupt.  However, this can potentially take up to
2506  * several milliseconds, so it is run via the watchdog routine, rather
2507  * than in the interrupt handler itself.
2508  */
2509 static void
2510 mxge_media_probe(mxge_softc_t *sc)
2511 {
2512 	mxge_cmd_t cmd;
2513 	const char *cage_type;
2514 	struct mxge_media_type *mxge_media_types = NULL;
2515 	int i, err, ms, mxge_media_type_entries;
2516 	uint32_t byte;
2517 
2518 	sc->need_media_probe = 0;
2519 
2520 	if (sc->connector == MXGE_XFP) {
2521 		/* -R is XFP */
2522 		mxge_media_types = mxge_xfp_media_types;
2523 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2524 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2525 		cage_type = "XFP";
2526 	} else if (sc->connector == MXGE_SFP) {
2527 		/* -S or -2S is SFP+ */
2528 		mxge_media_types = mxge_sfp_media_types;
2529 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2530 		cage_type = "SFP+";
2531 		byte = 3;
2532 	} else {
2533 		/* nothing to do; media type cannot change */
2534 		return;
2535 	}
2536 
2537 	/*
2538 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2539 	 * we try to determine what is in the cage by using the
2540 	 * firmware's I2C commands to read the module's 10GbE compliance
2541 	 * register.  We read just one byte, which may take over
2542 	 * a millisecond.
2543 	 */
2544 
2545 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2546 	cmd.data1 = byte;
2547 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2548 	if (err != MXGEFW_CMD_OK) {
2549 		if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2550 			if_printf(sc->ifp, "failed to read XFP\n");
2551 		else if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2552 			if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2553 		else
2554 			if_printf(sc->ifp, "I2C read failed, err: %d\n", err);
2555 		mxge_media_unset(sc);
2556 		return;
2557 	}
2558 
2559 	/* Now we wait for the data to be cached */
2560 	cmd.data0 = byte;
2561 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2562 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2563 		DELAY(1000);
2564 		cmd.data0 = byte;
2565 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2566 	}
2567 	if (err != MXGEFW_CMD_OK) {
2568 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2569 		    cage_type, err, ms);
2570 		mxge_media_unset(sc);
2571 		return;
2572 	}
2573 
2574 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2575 		if (bootverbose) {
2576 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2577 			    mxge_media_types[0].name);
2578 		}
2579 		if (sc->current_media != mxge_media_types[0].flag) {
2580 			mxge_media_unset(sc);
2581 			mxge_media_set(sc, mxge_media_types[0].flag);
2582 		}
2583 		return;
2584 	}
2585 	for (i = 1; i < mxge_media_type_entries; i++) {
2586 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2587 			if (bootverbose) {
2588 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2589 				    mxge_media_types[i].name);
2590 			}
2591 
2592 			if (sc->current_media != mxge_media_types[i].flag) {
2593 				mxge_media_unset(sc);
2594 				mxge_media_set(sc, mxge_media_types[i].flag);
2595 			}
2596 			return;
2597 		}
2598 	}
2599 	mxge_media_unset(sc);
2600 	if (bootverbose) {
2601 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2602 		    cmd.data0);
2603 	}
2604 }
2605 
2606 static void
2607 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2608 {
2609 	if (sc->link_state != stats->link_up) {
2610 		sc->link_state = stats->link_up;
2611 		if (sc->link_state) {
2612 			sc->ifp->if_link_state = LINK_STATE_UP;
2613 			if_link_state_change(sc->ifp);
2614 			if (bootverbose)
2615 				if_printf(sc->ifp, "link up\n");
2616 		} else {
2617 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2618 			if_link_state_change(sc->ifp);
2619 			if (bootverbose)
2620 				if_printf(sc->ifp, "link down\n");
2621 		}
2622 		sc->need_media_probe = 1;
2623 	}
2624 
2625 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2626 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2627 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2628 		    sc->rdma_tags_available);
2629 	}
2630 
2631 	if (stats->link_down) {
2632 		sc->down_cnt += stats->link_down;
2633 		sc->link_state = 0;
2634 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2635 		if_link_state_change(sc->ifp);
2636 	}
2637 }
2638 
2639 static void
2640 mxge_serialize_skipmain(struct mxge_softc *sc)
2641 {
2642 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2643 }
2644 
2645 static void
2646 mxge_deserialize_skipmain(struct mxge_softc *sc)
2647 {
2648 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2649 }
2650 
2651 static void
2652 mxge_legacy(void *arg)
2653 {
2654 	struct mxge_slice_state *ss = arg;
2655 	mxge_softc_t *sc = ss->sc;
2656 	mcp_irq_data_t *stats = ss->fw_stats;
2657 	mxge_tx_ring_t *tx = &ss->tx;
2658 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2659 	uint32_t send_done_count;
2660 	uint8_t valid;
2661 
2662 	ASSERT_SERIALIZED(&sc->main_serialize);
2663 
2664 	/* Make sure the DMA has finished */
2665 	if (!stats->valid)
2666 		return;
2667 	valid = stats->valid;
2668 
2669 	/* Lower legacy IRQ */
2670 	*sc->irq_deassert = 0;
2671 	if (!mxge_deassert_wait) {
2672 		/* Don't wait for confirmation that the irq is low */
2673 		stats->valid = 0;
2674 	}
2675 
2676 	mxge_serialize_skipmain(sc);
2677 
2678 	/*
2679 	 * Loop while waiting for legacy irq deassertion
2680 	 * XXX do we really want to loop?
2681 	 */
2682 	do {
2683 		/* Check for transmit completes and receives */
2684 		send_done_count = be32toh(stats->send_done_count);
2685 		while ((send_done_count != tx->pkt_done) ||
2686 		       (rx_done->entry[rx_done->idx].length != 0)) {
2687 			if (send_done_count != tx->pkt_done) {
2688 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2689 				    (int)send_done_count);
2690 			}
2691 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2692 			send_done_count = be32toh(stats->send_done_count);
2693 		}
2694 		if (mxge_deassert_wait)
2695 			wmb();
2696 	} while (*((volatile uint8_t *)&stats->valid));
2697 
2698 	mxge_deserialize_skipmain(sc);
2699 
2700 	/* Fw link & error stats meaningful only on the first slice */
2701 	if (__predict_false(stats->stats_updated))
2702 		mxge_intr_status(sc, stats);
2703 
2704 	/* Check to see if we have rx token to pass back */
2705 	if (valid & 0x1)
2706 		*ss->irq_claim = be32toh(3);
2707 	*(ss->irq_claim + 1) = be32toh(3);
2708 }
2709 
2710 static void
2711 mxge_msi(void *arg)
2712 {
2713 	struct mxge_slice_state *ss = arg;
2714 	mxge_softc_t *sc = ss->sc;
2715 	mcp_irq_data_t *stats = ss->fw_stats;
2716 	mxge_tx_ring_t *tx = &ss->tx;
2717 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2718 	uint32_t send_done_count;
2719 	uint8_t valid;
2720 #ifndef IFPOLL_ENABLE
2721 	const boolean_t polling = FALSE;
2722 #else
2723 	boolean_t polling = FALSE;
2724 #endif
2725 
2726 	ASSERT_SERIALIZED(&sc->main_serialize);
2727 
2728 	/* Make sure the DMA has finished */
2729 	if (__predict_false(!stats->valid))
2730 		return;
2731 
2732 	valid = stats->valid;
2733 	stats->valid = 0;
2734 
2735 #ifdef IFPOLL_ENABLE
2736 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2737 		polling = TRUE;
2738 #endif
2739 
2740 	if (!polling) {
2741 		/* Check for receives */
2742 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2743 		if (rx_done->entry[rx_done->idx].length != 0)
2744 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2745 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2746 	}
2747 
2748 	/*
2749 	 * Check for transmit completes
2750 	 *
2751 	 * NOTE:
2752 	 * Since pkt_done is only changed by mxge_tx_done(),
2753 	 * which is called only in interrupt handler, the
2754 	 * check w/o holding tx serializer is MPSAFE.
2755 	 */
2756 	send_done_count = be32toh(stats->send_done_count);
2757 	if (send_done_count != tx->pkt_done) {
2758 		lwkt_serialize_enter(&tx->tx_serialize);
2759 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2760 		lwkt_serialize_exit(&tx->tx_serialize);
2761 	}
2762 
2763 	if (__predict_false(stats->stats_updated))
2764 		mxge_intr_status(sc, stats);
2765 
2766 	/* Check to see if we have rx token to pass back */
2767 	if (!polling && (valid & 0x1))
2768 		*ss->irq_claim = be32toh(3);
2769 	*(ss->irq_claim + 1) = be32toh(3);
2770 }
2771 
2772 static void
2773 mxge_msix_rx(void *arg)
2774 {
2775 	struct mxge_slice_state *ss = arg;
2776 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2777 
2778 #ifdef IFPOLL_ENABLE
2779 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2780 		return;
2781 #endif
2782 
2783 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2784 
2785 	if (rx_done->entry[rx_done->idx].length != 0)
2786 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2787 
2788 	*ss->irq_claim = be32toh(3);
2789 }
2790 
2791 static void
2792 mxge_msix_rxtx(void *arg)
2793 {
2794 	struct mxge_slice_state *ss = arg;
2795 	mxge_softc_t *sc = ss->sc;
2796 	mcp_irq_data_t *stats = ss->fw_stats;
2797 	mxge_tx_ring_t *tx = &ss->tx;
2798 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2799 	uint32_t send_done_count;
2800 	uint8_t valid;
2801 #ifndef IFPOLL_ENABLE
2802 	const boolean_t polling = FALSE;
2803 #else
2804 	boolean_t polling = FALSE;
2805 #endif
2806 
2807 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2808 
2809 	/* Make sure the DMA has finished */
2810 	if (__predict_false(!stats->valid))
2811 		return;
2812 
2813 	valid = stats->valid;
2814 	stats->valid = 0;
2815 
2816 #ifdef IFPOLL_ENABLE
2817 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2818 		polling = TRUE;
2819 #endif
2820 
2821 	/* Check for receives */
2822 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2823 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2824 
2825 	/*
2826 	 * Check for transmit completes
2827 	 *
2828 	 * NOTE:
2829 	 * Since pkt_done is only changed by mxge_tx_done(),
2830 	 * which is called only in interrupt handler, the
2831 	 * check w/o holding tx serializer is MPSAFE.
2832 	 */
2833 	send_done_count = be32toh(stats->send_done_count);
2834 	if (send_done_count != tx->pkt_done) {
2835 		lwkt_serialize_enter(&tx->tx_serialize);
2836 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2837 		lwkt_serialize_exit(&tx->tx_serialize);
2838 	}
2839 
2840 	/* Check to see if we have rx token to pass back */
2841 	if (!polling && (valid & 0x1))
2842 		*ss->irq_claim = be32toh(3);
2843 	*(ss->irq_claim + 1) = be32toh(3);
2844 }
2845 
2846 static void
2847 mxge_init(void *arg)
2848 {
2849 	struct mxge_softc *sc = arg;
2850 
2851 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2852 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2853 		mxge_open(sc);
2854 }
2855 
2856 static void
2857 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2858 {
2859 	int i;
2860 
2861 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2862 		if (ss->rx_data.rx_big.info[i].m == NULL)
2863 			continue;
2864 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2865 		    ss->rx_data.rx_big.info[i].map);
2866 		m_freem(ss->rx_data.rx_big.info[i].m);
2867 		ss->rx_data.rx_big.info[i].m = NULL;
2868 	}
2869 
2870 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2871 		if (ss->rx_data.rx_small.info[i].m == NULL)
2872 			continue;
2873 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2874 		    ss->rx_data.rx_small.info[i].map);
2875 		m_freem(ss->rx_data.rx_small.info[i].m);
2876 		ss->rx_data.rx_small.info[i].m = NULL;
2877 	}
2878 
2879 	/* Transmit ring used only on the first slice */
2880 	if (ss->tx.info == NULL)
2881 		return;
2882 
2883 	for (i = 0; i <= ss->tx.mask; i++) {
2884 		if (ss->tx.info[i].m == NULL)
2885 			continue;
2886 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2887 		m_freem(ss->tx.info[i].m);
2888 		ss->tx.info[i].m = NULL;
2889 	}
2890 }
2891 
2892 static void
2893 mxge_free_mbufs(mxge_softc_t *sc)
2894 {
2895 	int slice;
2896 
2897 	for (slice = 0; slice < sc->num_slices; slice++)
2898 		mxge_free_slice_mbufs(&sc->ss[slice]);
2899 }
2900 
2901 static void
2902 mxge_free_slice_rings(struct mxge_slice_state *ss)
2903 {
2904 	int i;
2905 
2906 	if (ss->rx_data.rx_done.entry != NULL) {
2907 		mxge_dma_free(&ss->rx_done_dma);
2908 		ss->rx_data.rx_done.entry = NULL;
2909 	}
2910 
2911 	if (ss->tx.req_list != NULL) {
2912 		kfree(ss->tx.req_list, M_DEVBUF);
2913 		ss->tx.req_list = NULL;
2914 	}
2915 
2916 	if (ss->tx.seg_list != NULL) {
2917 		kfree(ss->tx.seg_list, M_DEVBUF);
2918 		ss->tx.seg_list = NULL;
2919 	}
2920 
2921 	if (ss->rx_data.rx_small.shadow != NULL) {
2922 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2923 		ss->rx_data.rx_small.shadow = NULL;
2924 	}
2925 
2926 	if (ss->rx_data.rx_big.shadow != NULL) {
2927 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2928 		ss->rx_data.rx_big.shadow = NULL;
2929 	}
2930 
2931 	if (ss->tx.info != NULL) {
2932 		if (ss->tx.dmat != NULL) {
2933 			for (i = 0; i <= ss->tx.mask; i++) {
2934 				bus_dmamap_destroy(ss->tx.dmat,
2935 				    ss->tx.info[i].map);
2936 			}
2937 			bus_dma_tag_destroy(ss->tx.dmat);
2938 		}
2939 		kfree(ss->tx.info, M_DEVBUF);
2940 		ss->tx.info = NULL;
2941 	}
2942 
2943 	if (ss->rx_data.rx_small.info != NULL) {
2944 		if (ss->rx_data.rx_small.dmat != NULL) {
2945 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2946 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2947 				    ss->rx_data.rx_small.info[i].map);
2948 			}
2949 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2950 			    ss->rx_data.rx_small.extra_map);
2951 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2952 		}
2953 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2954 		ss->rx_data.rx_small.info = NULL;
2955 	}
2956 
2957 	if (ss->rx_data.rx_big.info != NULL) {
2958 		if (ss->rx_data.rx_big.dmat != NULL) {
2959 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2960 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2961 				    ss->rx_data.rx_big.info[i].map);
2962 			}
2963 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2964 			    ss->rx_data.rx_big.extra_map);
2965 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2966 		}
2967 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2968 		ss->rx_data.rx_big.info = NULL;
2969 	}
2970 }
2971 
2972 static void
2973 mxge_free_rings(mxge_softc_t *sc)
2974 {
2975 	int slice;
2976 
2977 	if (sc->ss == NULL)
2978 		return;
2979 
2980 	for (slice = 0; slice < sc->num_slices; slice++)
2981 		mxge_free_slice_rings(&sc->ss[slice]);
2982 }
2983 
2984 static int
2985 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2986     int tx_ring_entries)
2987 {
2988 	mxge_softc_t *sc = ss->sc;
2989 	size_t bytes;
2990 	int err, i;
2991 
2992 	/*
2993 	 * Allocate per-slice receive resources
2994 	 */
2995 
2996 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
2997 	    rx_ring_entries - 1;
2998 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
2999 
3000 	/* Allocate the rx shadow rings */
3001 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3002 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3003 
3004 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3005 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3006 
3007 	/* Allocate the rx host info rings */
3008 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3009 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3010 
3011 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3012 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3013 
3014 	/* Allocate the rx busdma resources */
3015 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3016 				 1,			/* alignment */
3017 				 4096,			/* boundary */
3018 				 BUS_SPACE_MAXADDR,	/* low */
3019 				 BUS_SPACE_MAXADDR,	/* high */
3020 				 NULL, NULL,		/* filter */
3021 				 MHLEN,			/* maxsize */
3022 				 1,			/* num segs */
3023 				 MHLEN,			/* maxsegsize */
3024 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3025 				 			/* flags */
3026 				 &ss->rx_data.rx_small.dmat); /* tag */
3027 	if (err != 0) {
3028 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3029 		    err);
3030 		return err;
3031 	}
3032 
3033 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3034 	    &ss->rx_data.rx_small.extra_map);
3035 	if (err != 0) {
3036 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3037 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3038 		ss->rx_data.rx_small.dmat = NULL;
3039 		return err;
3040 	}
3041 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3042 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3043 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3044 		if (err != 0) {
3045 			int j;
3046 
3047 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3048 
3049 			for (j = 0; j < i; ++j) {
3050 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3051 				    ss->rx_data.rx_small.info[j].map);
3052 			}
3053 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3054 			    ss->rx_data.rx_small.extra_map);
3055 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3056 			ss->rx_data.rx_small.dmat = NULL;
3057 			return err;
3058 		}
3059 	}
3060 
3061 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3062 				 1,			/* alignment */
3063 				 4096,			/* boundary */
3064 				 BUS_SPACE_MAXADDR,	/* low */
3065 				 BUS_SPACE_MAXADDR,	/* high */
3066 				 NULL, NULL,		/* filter */
3067 				 4096,			/* maxsize */
3068 				 1,			/* num segs */
3069 				 4096,			/* maxsegsize*/
3070 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3071 				 			/* flags */
3072 				 &ss->rx_data.rx_big.dmat); /* tag */
3073 	if (err != 0) {
3074 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3075 		    err);
3076 		return err;
3077 	}
3078 
3079 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3080 	    &ss->rx_data.rx_big.extra_map);
3081 	if (err != 0) {
3082 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3083 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3084 		ss->rx_data.rx_big.dmat = NULL;
3085 		return err;
3086 	}
3087 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3088 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3089 		    &ss->rx_data.rx_big.info[i].map);
3090 		if (err != 0) {
3091 			int j;
3092 
3093 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3094 			for (j = 0; j < i; ++j) {
3095 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3096 				    ss->rx_data.rx_big.info[j].map);
3097 			}
3098 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3099 			    ss->rx_data.rx_big.extra_map);
3100 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3101 			ss->rx_data.rx_big.dmat = NULL;
3102 			return err;
3103 		}
3104 	}
3105 
3106 	/*
3107 	 * Now allocate TX resources
3108 	 */
3109 
3110 	ss->tx.mask = tx_ring_entries - 1;
3111 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3112 
3113 	/*
3114 	 * Allocate the tx request copy block; MUST be at least 8 bytes
3115 	 * aligned
3116 	 */
3117 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3118 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3119 	    M_DEVBUF, M_WAITOK);
3120 
3121 	/* Allocate the tx busdma segment list */
3122 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3123 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3124 
3125 	/* Allocate the tx host info ring */
3126 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3127 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3128 
3129 	/* Allocate the tx busdma resources */
3130 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3131 				 1,			/* alignment */
3132 				 sc->tx_boundary,	/* boundary */
3133 				 BUS_SPACE_MAXADDR,	/* low */
3134 				 BUS_SPACE_MAXADDR,	/* high */
3135 				 NULL, NULL,		/* filter */
3136 				 IP_MAXPACKET +
3137 				 sizeof(struct ether_vlan_header),
3138 				 			/* maxsize */
3139 				 ss->tx.max_desc - 2,	/* num segs */
3140 				 sc->tx_boundary,	/* maxsegsz */
3141 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3142 				 BUS_DMA_ONEBPAGE,	/* flags */
3143 				 &ss->tx.dmat);		/* tag */
3144 	if (err != 0) {
3145 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3146 		return err;
3147 	}
3148 
3149 	/*
3150 	 * Now use these tags to setup DMA maps for each slot in the ring
3151 	 */
3152 	for (i = 0; i <= ss->tx.mask; i++) {
3153 		err = bus_dmamap_create(ss->tx.dmat,
3154 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3155 		if (err != 0) {
3156 			int j;
3157 
3158 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3159 			for (j = 0; j < i; ++j) {
3160 				bus_dmamap_destroy(ss->tx.dmat,
3161 				    ss->tx.info[j].map);
3162 			}
3163 			bus_dma_tag_destroy(ss->tx.dmat);
3164 			ss->tx.dmat = NULL;
3165 			return err;
3166 		}
3167 	}
3168 	return 0;
3169 }
3170 
3171 static int
3172 mxge_alloc_rings(mxge_softc_t *sc)
3173 {
3174 	mxge_cmd_t cmd;
3175 	int tx_ring_size;
3176 	int tx_ring_entries, rx_ring_entries;
3177 	int err, slice;
3178 
3179 	/* Get ring sizes */
3180 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3181 	if (err != 0) {
3182 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3183 		return err;
3184 	}
3185 	tx_ring_size = cmd.data0;
3186 
3187 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3188 	rx_ring_entries = sc->rx_intr_slots / 2;
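	/*
	 * The small and big receive rings share one completion ring,
	 * whose size is sc->rx_intr_slots, so each data ring gets half
	 * the slots (the rx_done mask is sized to 2 * rx_ring_entries
	 * in mxge_alloc_slice_rings()).
	 */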
3189 
3190 	if (bootverbose) {
3191 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3192 		    tx_ring_entries, rx_ring_entries);
3193 	}
3194 
3195 	sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3196 	sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3197 
3198 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3199 	ifq_set_ready(&sc->ifp->if_snd);
3200 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3201 
3202 	if (sc->num_tx_rings > 1) {
3203 		sc->ifp->if_mapsubq = ifq_mapsubq_mask;
3204 		ifq_set_subq_mask(&sc->ifp->if_snd, sc->num_tx_rings - 1);
3205 	}
3206 
3207 	for (slice = 0; slice < sc->num_slices; slice++) {
3208 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3209 		    rx_ring_entries, tx_ring_entries);
3210 		if (err != 0) {
3211 			device_printf(sc->dev,
3212 			    "alloc %d slice rings failed\n", slice);
3213 			return err;
3214 		}
3215 	}
3216 	return 0;
3217 }
3218 
3219 static void
3220 mxge_choose_params(int mtu, int *cl_size)
3221 {
3222 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3223 
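	/*
	 * E.g. a standard 1500-byte MTU gives 1500 + 14 + 4 + 2 = 1520
	 * bytes, which fits a standard 2KB cluster; anything larger
	 * (up to the KASSERT'ed MJUMPAGESIZE limit) takes a page-sized
	 * cluster instead.
	 */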
3224 	if (bufsize < MCLBYTES) {
3225 		*cl_size = MCLBYTES;
3226 	} else {
3227 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3228 		*cl_size = MJUMPAGESIZE;
3229 	}
3230 }
3231 
3232 static int
3233 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3234 {
3235 	mxge_cmd_t cmd;
3236 	int err, i, slice;
3237 
3238 	slice = ss - ss->sc->ss;
3239 
3240 	/*
3241 	 * Get the lanai pointers to the send and receive rings
3242 	 */
3243 	err = 0;
3244 
3245 	if (ss->sc->num_tx_rings == 1) {
3246 		if (slice == 0) {
3247 			cmd.data0 = slice;
3248 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3249 			    &cmd);
3250 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3251 			    (ss->sc->sram + cmd.data0);
3252 			/* Leave send_go and send_stop as NULL */
3253 		}
3254 	} else {
3255 		cmd.data0 = slice;
3256 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3257 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3258 		    (ss->sc->sram + cmd.data0);
3259 		ss->tx.send_go = (volatile uint32_t *)
3260 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3261 		ss->tx.send_stop = (volatile uint32_t *)
3262 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3263 	}
3264 
3265 	cmd.data0 = slice;
3266 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3267 	ss->rx_data.rx_small.lanai =
3268 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3269 
3270 	cmd.data0 = slice;
3271 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3272 	ss->rx_data.rx_big.lanai =
3273 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3274 
3275 	if (err != 0) {
3276 		if_printf(ss->sc->ifp,
3277 		    "failed to get ring sizes or locations\n");
3278 		return EIO;
3279 	}
3280 
3281 	/*
3282 	 * Stock small receive ring
3283 	 */
3284 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3285 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3286 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3287 		if (err) {
3288 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3289 			    ss->rx_data.rx_small.mask + 1);
3290 			return ENOMEM;
3291 		}
3292 	}
3293 
3294 	/*
3295 	 * Stock big receive ring
3296 	 */
3297 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3298 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3299 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3300 	}
3301 
3302 	ss->rx_data.rx_big.cl_size = cl_size;
3303 
3304 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3305 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3306 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3307 		if (err) {
3308 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3309 			    ss->rx_data.rx_big.mask + 1);
3310 			return ENOMEM;
3311 		}
3312 	}
3313 	return 0;
3314 }
3315 
3316 static int
3317 mxge_open(mxge_softc_t *sc)
3318 {
3319 	struct ifnet *ifp = sc->ifp;
3320 	mxge_cmd_t cmd;
3321 	int err, slice, cl_size, i;
3322 	bus_addr_t bus;
3323 	volatile uint8_t *itable;
3324 	struct mxge_slice_state *ss;
3325 
3326 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3327 
3328 	/* Copy the MAC address in case it was overridden */
3329 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3330 
3331 	err = mxge_reset(sc, 1);
3332 	if (err != 0) {
3333 		if_printf(ifp, "failed to reset\n");
3334 		return EIO;
3335 	}
3336 
3337 	if (sc->num_slices > 1) {
3338 		/* Setup the indirection table */
3339 		cmd.data0 = sc->num_slices;
3340 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3341 
3342 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3343 		if (err != 0) {
3344 			if_printf(ifp, "failed to setup rss tables\n");
3345 			return err;
3346 		}
3347 
3348 		/* Just enable an identity mapping */
3349 		itable = sc->sram + cmd.data0;
3350 		for (i = 0; i < sc->num_slices; i++)
3351 			itable[i] = (uint8_t)i;
3352 
3353 		if (sc->use_rss) {
3354 			volatile uint8_t *hwkey;
3355 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3356 
3357 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3358 			    &cmd);
3359 			if (err != 0) {
3360 				if_printf(ifp, "failed to get rsskey\n");
3361 				return err;
3362 			}
3363 			hwkey = sc->sram + cmd.data0;
3364 
3365 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3366 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3367 				hwkey[i] = swkey[i];
3368 			wmb();
3369 
3370 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3371 			    &cmd);
3372 			if (err != 0) {
3373 				if_printf(ifp, "failed to update rsskey\n");
3374 				return err;
3375 			}
3376 			if (bootverbose)
3377 				if_printf(ifp, "RSS key updated\n");
3378 		}
3379 
3380 		cmd.data0 = 1;
3381 		if (sc->use_rss) {
3382 			if (bootverbose)
3383 				if_printf(ifp, "input hash: RSS\n");
3384 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3385 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3386 		} else {
3387 			if (bootverbose)
3388 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3389 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3390 		}
3391 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3392 		if (err != 0) {
3393 			if_printf(ifp, "failed to enable slices\n");
3394 			return err;
3395 		}
3396 	}
3397 
3398 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3399 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3400 	if (err) {
3401 		/*
3402 		 * Can't change TSO mode to NDIS, never allow TSO then
3403 		 */
3404 		if_printf(ifp, "failed to set TSO mode\n");
3405 		ifp->if_capenable &= ~IFCAP_TSO;
3406 		ifp->if_capabilities &= ~IFCAP_TSO;
3407 		ifp->if_hwassist &= ~CSUM_TSO;
3408 	}
3409 
3410 	mxge_choose_params(ifp->if_mtu, &cl_size);
3411 
3412 	cmd.data0 = 1;
3413 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3414 	/*
3415 	 * Error is only meaningful if we're trying to set
3416 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3417 	 */
3418 
3419 	/*
3420 	 * Give the firmware the mtu and the big and small buffer
3421 	 * sizes.  The firmware wants the big buf size to be a power
3422 	 * of two.  Luckily, DragonFly's clusters are powers of two.
3423 	 */
3424 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3425 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3426 
3427 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3428 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3429 
3430 	cmd.data0 = cl_size;
3431 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3432 
3433 	if (err != 0) {
3434 		if_printf(ifp, "failed to setup params\n");
3435 		goto abort;
3436 	}
3437 
3438 	/* Now give the firmware the pointer to the stats block */
3439 	for (slice = 0; slice < sc->num_slices; slice++) {
3440 		ss = &sc->ss[slice];
3441 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3442 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3443 		cmd.data2 = sizeof(struct mcp_irq_data);
3444 		cmd.data2 |= (slice << 16);
3445 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3446 	}
3447 
3448 	if (err != 0) {
3449 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3450 		bus += offsetof(struct mcp_irq_data, send_done_count);
3451 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3452 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3453 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3454 		    &cmd);
3455 
3456 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3457 		sc->fw_multicast_support = 0;
3458 	} else {
3459 		sc->fw_multicast_support = 1;
3460 	}
3461 
3462 	if (err != 0) {
3463 		if_printf(ifp, "failed to setup params\n");
3464 		goto abort;
3465 	}
3466 
3467 	for (slice = 0; slice < sc->num_slices; slice++) {
3468 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3469 		if (err != 0) {
3470 			if_printf(ifp, "couldn't open slice %d\n", slice);
3471 			goto abort;
3472 		}
3473 	}
3474 
3475 	/* Finally, start the firmware running */
3476 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3477 	if (err) {
3478 		if_printf(ifp, "Couldn't bring up link\n");
3479 		goto abort;
3480 	}
3481 
3482 	ifp->if_flags |= IFF_RUNNING;
3483 	for (i = 0; i < sc->num_tx_rings; ++i) {
3484 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3485 
3486 		ifsq_clr_oactive(tx->ifsq);
3487 		ifsq_watchdog_start(&tx->watchdog);
3488 	}
3489 
3490 	return 0;
3491 
3492 abort:
3493 	mxge_free_mbufs(sc);
3494 	return err;
3495 }
3496 
3497 static void
3498 mxge_close(mxge_softc_t *sc, int down)
3499 {
3500 	struct ifnet *ifp = sc->ifp;
3501 	mxge_cmd_t cmd;
3502 	int err, old_down_cnt, i;
3503 
3504 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3505 
3506 	if (!down) {
3507 		old_down_cnt = sc->down_cnt;
3508 		wmb();
3509 
3510 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3511 		if (err)
3512 			if_printf(ifp, "Couldn't bring down link\n");
3513 
3514 		if (old_down_cnt == sc->down_cnt) {
3515 			/*
3516 			 * Wait for down irq
3517 			 * XXX racy
3518 			 */
3519 			ifnet_deserialize_all(ifp);
3520 			DELAY(10 * sc->intr_coal_delay);
3521 			ifnet_serialize_all(ifp);
3522 		}
3523 
3524 		wmb();
3525 		if (old_down_cnt == sc->down_cnt)
3526 			if_printf(ifp, "never got down irq\n");
3527 	}
3528 	mxge_free_mbufs(sc);
3529 
3530 	ifp->if_flags &= ~IFF_RUNNING;
3531 	for (i = 0; i < sc->num_tx_rings; ++i) {
3532 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3533 
3534 		ifsq_clr_oactive(tx->ifsq);
3535 		ifsq_watchdog_stop(&tx->watchdog);
3536 	}
3537 }
3538 
3539 static void
3540 mxge_setup_cfg_space(mxge_softc_t *sc)
3541 {
3542 	device_t dev = sc->dev;
3543 	int reg;
3544 	uint16_t lnk, pectl;
3545 
3546 	/* Find the PCIe link width and set max read request to 4KB */
3547 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
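		/*
		 * Offset 0x12 within the PCIe capability is the Link
		 * Status register; bits 9:4 carry the negotiated width.
		 */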
3548 		lnk = pci_read_config(dev, reg + 0x12, 2);
3549 		sc->link_width = (lnk >> 4) & 0x3f;
3550 
3551 		if (sc->pectl == 0) {
3552 			pectl = pci_read_config(dev, reg + 0x8, 2);
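			/*
			 * Device Control register (offset 0x8): the max
			 * read request size lives in bits 14:12, and the
			 * encoding 101b (5) selects 4096-byte reads.
			 */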
3553 			pectl = (pectl & ~0x7000) | (5 << 12);
3554 			pci_write_config(dev, reg + 0x8, pectl, 2);
3555 			sc->pectl = pectl;
3556 		} else {
3557 			/* Restore saved pectl after watchdog reset */
3558 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3559 		}
3560 	}
3561 
3562 	/* Enable DMA and memory space access */
3563 	pci_enable_busmaster(dev);
3564 }
3565 
3566 static uint32_t
3567 mxge_read_reboot(mxge_softc_t *sc)
3568 {
3569 	device_t dev = sc->dev;
3570 	uint32_t vs;
3571 
3572 	/* Find the vendor specific offset */
3573 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3574 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3575 		return (uint32_t)-1;
3576 	}
3577 	/* Enable read32 mode */
3578 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3579 	/* Tell NIC which register to read */
3580 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3581 	return pci_read_config(dev, vs + 0x14, 4);
3582 }
3583 
3584 static void
3585 mxge_watchdog_reset(mxge_softc_t *sc)
3586 {
3587 	struct pci_devinfo *dinfo;
3588 	int err, running;
3589 	uint32_t reboot;
3590 	uint16_t cmd;
3591 
3592 	err = ENXIO;
3593 
3594 	if_printf(sc->ifp, "Watchdog reset!\n");
3595 
3596 	/*
3597 	 * Check to see if the NIC rebooted.  If it did, then all of
3598 	 * PCI config space has been reset, and things like the
3599 	 * busmaster bit will be zero.  If this is the case, then we
3600 	 * must restore PCI config space before the NIC can be used
3601 	 * again
3602 	 */
3603 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3604 	if (cmd == 0xffff) {
3605 		/*
3606 		 * Maybe the watchdog caught the NIC rebooting; wait
3607 		 * up to 100ms for it to finish.  If it does not come
3608 		 * back, then give up
3609 		 */
3610 		DELAY(1000*100);
3611 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3612 		if (cmd == 0xffff)
3613 			if_printf(sc->ifp, "NIC disappeared!\n");
3614 	}
3615 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3616 		/* Print the reboot status */
3617 		reboot = mxge_read_reboot(sc);
3618 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3619 
3620 		running = sc->ifp->if_flags & IFF_RUNNING;
3621 		if (running) {
3622 			/*
3623 			 * Quiesce NIC so that TX routines will not try to
3624 			 * xmit after restoration of BAR
3625 			 */
3626 
3627 			/* Mark the link as down */
3628 			if (sc->link_state) {
3629 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3630 				if_link_state_change(sc->ifp);
3631 			}
3632 			mxge_close(sc, 1);
3633 		}
3634 		/* Restore PCI configuration space */
3635 		dinfo = device_get_ivars(sc->dev);
3636 		pci_cfg_restore(sc->dev, dinfo);
3637 
3638 		/* And redo any changes we made to our config space */
3639 		mxge_setup_cfg_space(sc);
3640 
3641 		/* Reload f/w */
3642 		err = mxge_load_firmware(sc, 0);
3643 		if (err)
3644 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3645 		if (running && !err) {
3646 			int i;
3647 
3648 			err = mxge_open(sc);
3649 
3650 			for (i = 0; i < sc->num_tx_rings; ++i)
3651 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3652 		}
3653 		sc->watchdog_resets++;
3654 	} else {
3655 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3656 		err = 0;
3657 	}
3658 	if (err) {
3659 		if_printf(sc->ifp, "watchdog reset failed\n");
3660 	} else {
3661 		if (sc->dying == 2)
3662 			sc->dying = 0;
3663 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3664 	}
3665 }
3666 
3667 static void
3668 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3669 {
3670 	if_printf(sc->ifp, "slice %d struck? ring state:\n", slice);
3671 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3672 	    tx->req, tx->done, tx->queue_active);
3673 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3674 	    tx->activate, tx->deactivate);
3675 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3676 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3677 }
3678 
3679 static u_long
3680 mxge_update_stats(mxge_softc_t *sc)
3681 {
3682 	u_long ipackets, opackets, pkts;
3683 
3684 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3685 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3686 
3687 	pkts = ipackets - sc->ipackets;
3688 	pkts += opackets - sc->opackets;
3689 
3690 	sc->ipackets = ipackets;
3691 	sc->opackets = opackets;
3692 
3693 	return pkts;
3694 }
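
/*
 * mxge_update_stats() returns the number of packets moved in either
 * direction since the previous call; mxge_tick() below uses a zero
 * delta as its "NIC is idle" heuristic.
 */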
3695 
3696 static void
3697 mxge_tick(void *arg)
3698 {
3699 	mxge_softc_t *sc = arg;
3700 	u_long pkts = 0;
3701 	int err = 0;
3702 	int ticks;
3703 
3704 	lwkt_serialize_enter(&sc->main_serialize);
3705 
3706 	ticks = mxge_ticks;
3707 	if (sc->ifp->if_flags & IFF_RUNNING) {
3708 		/* Aggregate stats from different slices */
3709 		pkts = mxge_update_stats(sc);
3710 		if (sc->need_media_probe)
3711 			mxge_media_probe(sc);
3712 	}
3713 	if (pkts == 0) {
3714 		uint16_t cmd;
3715 
3716 		/* Ensure NIC did not suffer h/w fault while idle */
3717 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3718 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3719 			sc->dying = 2;
3720 			mxge_serialize_skipmain(sc);
3721 			mxge_watchdog_reset(sc);
3722 			mxge_deserialize_skipmain(sc);
3723 			err = ENXIO;
3724 		}
3725 
3726 		/* Look less often if NIC is idle */
3727 		ticks *= 4;
3728 	}
3729 
3730 	if (err == 0)
3731 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3732 
3733 	lwkt_serialize_exit(&sc->main_serialize);
3734 }
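
/*
 * Timing sketch for mxge_tick() above, using the defaults from
 * mxge_fetch_tunables() (mxge_ticks = hz / 2):
 *
 *	ticks = mxge_ticks;	busy NIC: rechecked every ~0.5s
 *	if (pkts == 0)
 *		ticks *= 4;	idle NIC: rechecked every ~2s
 *
 * Note that the error path deliberately does not rearm the callout;
 * mxge_watchdog_reset() reschedules it itself once recovery succeeds.
 */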
3735 
3736 static int
3737 mxge_media_change(struct ifnet *ifp)
3738 {
3739 	mxge_softc_t *sc = ifp->if_softc;
3740 	const struct ifmedia *ifm = &sc->media;
3741 	int pause;
3742 
3743 	if (IFM_OPTIONS(ifm->ifm_media) & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
3744 		if (sc->pause)
3745 			return 0;
3746 		pause = 1;
3747 	} else {
3748 		if (!sc->pause)
3749 			return 0;
3750 		pause = 0;
3751 	}
3752 	return mxge_change_pause(sc, pause);
3753 }
3754 
3755 static int
3756 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3757 {
3758 	struct ifnet *ifp = sc->ifp;
3759 	int real_mtu, old_mtu;
3760 	int err = 0;
3761 
3762 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3763 	if (mtu > sc->max_mtu || real_mtu < 60)
3764 		return EINVAL;
3765 
3766 	old_mtu = ifp->if_mtu;
3767 	ifp->if_mtu = mtu;
3768 	if (ifp->if_flags & IFF_RUNNING) {
3769 		mxge_close(sc, 0);
3770 		err = mxge_open(sc);
3771 		if (err != 0) {
3772 			ifp->if_mtu = old_mtu;
3773 			mxge_close(sc, 0);
3774 			mxge_open(sc);
3775 		}
3776 	}
3777 	return err;
3778 }
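
/*
 * Worked example for the sanity check in mxge_change_mtu() above:
 * real_mtu is the on-wire frame size implied by the requested MTU,
 * i.e. MTU plus the Ethernet header (ETHER_HDR_LEN, 14 bytes) and a
 * VLAN tag (EVL_ENCAPLEN, 4 bytes).  A standard MTU of 1500 yields
 * real_mtu = 1500 + 14 + 4 = 1518; the lower bound of 60 matches the
 * minimum Ethernet frame size excluding the 4-byte FCS.
 */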
3779 
3780 static void
3781 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3782 {
3783 	mxge_softc_t *sc = ifp->if_softc;
3784 
3785 	ifmr->ifm_status = IFM_AVALID;
3786 	ifmr->ifm_active = IFM_ETHER;
3787 
3788 	if (sc->link_state)
3789 		ifmr->ifm_status |= IFM_ACTIVE;
3790 
3791 	/*
3792 	 * Autoselect is not supported, so always report the
3793 	 * current media.
3794 	 */
3795 	ifmr->ifm_active |= sc->current_media;
3796 	if (sc->current_media != IFM_NONE) {
3797 		ifmr->ifm_active |= MXGE_IFM;
3798 		if (sc->pause)
3799 			ifmr->ifm_active |= IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
3800 	}
3801 }
3802 
3803 static int
3804 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3805     struct ucred *cr __unused)
3806 {
3807 	mxge_softc_t *sc = ifp->if_softc;
3808 	struct ifreq *ifr = (struct ifreq *)data;
3809 	int err, mask;
3810 
3811 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3812 	err = 0;
3813 
3814 	switch (command) {
3815 	case SIOCSIFMTU:
3816 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3817 		break;
3818 
3819 	case SIOCSIFFLAGS:
3820 		if (sc->dying)
3821 			return EINVAL;
3822 
3823 		if (ifp->if_flags & IFF_UP) {
3824 			if (!(ifp->if_flags & IFF_RUNNING)) {
3825 				err = mxge_open(sc);
3826 			} else {
3827 				/*
3828 				 * Take care of PROMISC and ALLMULTI
3829 				 * flag changes
3830 				 */
3831 				mxge_change_promisc(sc,
3832 				    ifp->if_flags & IFF_PROMISC);
3833 				mxge_set_multicast_list(sc);
3834 			}
3835 		} else {
3836 			if (ifp->if_flags & IFF_RUNNING)
3837 				mxge_close(sc, 0);
3838 		}
3839 		break;
3840 
3841 	case SIOCADDMULTI:
3842 	case SIOCDELMULTI:
3843 		mxge_set_multicast_list(sc);
3844 		break;
3845 
3846 	case SIOCSIFCAP:
3847 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3848 		if (mask & IFCAP_TXCSUM) {
3849 			ifp->if_capenable ^= IFCAP_TXCSUM;
3850 			if (ifp->if_capenable & IFCAP_TXCSUM)
3851 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3852 			else
3853 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3854 		}
3855 		if (mask & IFCAP_TSO) {
3856 			ifp->if_capenable ^= IFCAP_TSO;
3857 			if (ifp->if_capenable & IFCAP_TSO)
3858 				ifp->if_hwassist |= CSUM_TSO;
3859 			else
3860 				ifp->if_hwassist &= ~CSUM_TSO;
3861 		}
3862 		if (mask & IFCAP_RXCSUM)
3863 			ifp->if_capenable ^= IFCAP_RXCSUM;
3864 		if (mask & IFCAP_VLAN_HWTAGGING)
3865 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3866 		break;
3867 
3868 	case SIOCGIFMEDIA:
3869 	case SIOCSIFMEDIA:
3870 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3871 		    &sc->media, command);
3872 		break;
3873 
3874 	default:
3875 		err = ether_ioctl(ifp, command, data);
3876 		break;
3877 	}
3878 	return err;
3879 }
3880 
3881 static void
3882 mxge_fetch_tunables(mxge_softc_t *sc)
3883 {
3884 	int ifm;
3885 
3886 	sc->intr_coal_delay = mxge_intr_coal_delay;
3887 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3888 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3889 
3890 	/* XXX */
3891 	if (mxge_ticks == 0)
3892 		mxge_ticks = hz / 2;
3893 
3894 	ifm = ifmedia_str2ethfc(mxge_flowctrl);
3895 	if (ifm & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE))
3896 		sc->pause = 1;
3897 
3898 	sc->use_rss = mxge_use_rss;
3899 
3900 	sc->throttle = mxge_throttle;
3901 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3902 		sc->throttle = MXGE_MAX_THROTTLE;
3903 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3904 		sc->throttle = MXGE_MIN_THROTTLE;
3905 }
3906 
3907 static void
3908 mxge_free_slices(mxge_softc_t *sc)
3909 {
3910 	struct mxge_slice_state *ss;
3911 	int i;
3912 
3913 	if (sc->ss == NULL)
3914 		return;
3915 
3916 	for (i = 0; i < sc->num_slices; i++) {
3917 		ss = &sc->ss[i];
3918 		if (ss->fw_stats != NULL) {
3919 			mxge_dma_free(&ss->fw_stats_dma);
3920 			ss->fw_stats = NULL;
3921 		}
3922 		if (ss->rx_data.rx_done.entry != NULL) {
3923 			mxge_dma_free(&ss->rx_done_dma);
3924 			ss->rx_data.rx_done.entry = NULL;
3925 		}
3926 	}
3927 	kfree(sc->ss, M_DEVBUF);
3928 	sc->ss = NULL;
3929 }
3930 
3931 static int
3932 mxge_alloc_slices(mxge_softc_t *sc)
3933 {
3934 	mxge_cmd_t cmd;
3935 	struct mxge_slice_state *ss;
3936 	size_t bytes;
3937 	int err, i, rx_ring_size;
3938 
3939 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3940 	if (err != 0) {
3941 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3942 		return err;
3943 	}
3944 	rx_ring_size = cmd.data0;
3945 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof(mcp_dma_addr_t));
3946 
3947 	bytes = sizeof(*sc->ss) * sc->num_slices;
3948 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3949 
3950 	for (i = 0; i < sc->num_slices; i++) {
3951 		ss = &sc->ss[i];
3952 
3953 		ss->sc = sc;
3954 
3955 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3956 		lwkt_serialize_init(&ss->tx.tx_serialize);
3957 		ss->intr_rid = -1;
3958 
3959 		/*
3960 		 * Allocate per-slice rx interrupt queue
3961 		 * XXX assumes a 4-byte mcp_slot
3962 		 */
3963 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3964 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3965 		if (err != 0) {
3966 			device_printf(sc->dev,
3967 			    "alloc %d slice rx_done failed\n", i);
3968 			return err;
3969 		}
3970 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3971 
3972 		/*
3973 		 * Allocate the per-slice firmware stats
3974 		 */
3975 		bytes = sizeof(*ss->fw_stats);
3976 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3977 		    bytes, 64);
3978 		if (err != 0) {
3979 			device_printf(sc->dev,
3980 			    "alloc %d fw_stats failed\n", i);
3981 			return err;
3982 		}
3983 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3984 	}
3985 	return 0;
3986 }
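
/*
 * Sizing note for rx_intr_slots in mxge_alloc_slices() above: the
 * firmware reports the rx ring size in bytes and each ring entry is
 * an 8-byte mcp_dma_addr_t, so rx_ring_size / sizeof(mcp_dma_addr_t)
 * is the number of receive descriptors per ring.  The interrupt
 * queue is doubled (2 *), presumably so that it can absorb
 * completions from both the small and the big receive rings of a
 * slice without overflowing.
 */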
3987 
3988 static void
3989 mxge_slice_probe(mxge_softc_t *sc)
3990 {
3991 	int status, max_intr_slots, max_slices, num_slices;
3992 	int msix_cnt, msix_enable, i, multi_tx;
3993 	mxge_cmd_t cmd;
3994 	const char *old_fw;
3995 
3996 	sc->num_slices = 1;
3997 	sc->num_tx_rings = 1;
3998 
3999 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
4000 	if (num_slices == 1)
4001 		return;
4002 
4003 	if (ncpus2 == 1)
4004 		return;
4005 
4006 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
4007 	    mxge_msix_enable);
4008 	if (!msix_enable)
4009 		return;
4010 
4011 	msix_cnt = pci_msix_count(sc->dev);
4012 	if (msix_cnt < 2)
4013 		return;
4014 
4015 	/*
4016 	 * Round down MSI-X vector count to the nearest power of 2
4017 	 */
4018 	i = 0;
4019 	while ((1 << (i + 1)) <= msix_cnt)
4020 		++i;
4021 	msix_cnt = 1 << i;
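	/*
	 * Equivalent bit-twiddling sketch (illustrative only), using
	 * the kernel's fls():
	 *
	 *	msix_cnt = 1 << (fls(msix_cnt) - 1);
	 *
	 * e.g. 12 vectors round down to 8, while 16 stay at 16.
	 */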
4022 
4023 	/*
4024 	 * Now load the slice-aware firmware and see what it supports
4025 	 */
4026 	old_fw = sc->fw_name;
4027 	if (old_fw == mxge_fw_aligned)
4028 		sc->fw_name = mxge_fw_rss_aligned;
4029 	else
4030 		sc->fw_name = mxge_fw_rss_unaligned;
4031 	status = mxge_load_firmware(sc, 0);
4032 	if (status != 0) {
4033 		device_printf(sc->dev, "Falling back to a single slice\n");
4034 		return;
4035 	}
4036 
4037 	/*
4038 	 * Try to send a reset command to the card to see if it is alive
4039 	 */
4040 	memset(&cmd, 0, sizeof(cmd));
4041 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4042 	if (status != 0) {
4043 		device_printf(sc->dev, "failed reset\n");
4044 		goto abort_with_fw;
4045 	}
4046 
4047 	/*
4048 	 * Get rx ring size to calculate rx interrupt queue size
4049 	 */
4050 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4051 	if (status != 0) {
4052 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4053 		goto abort_with_fw;
4054 	}
4055 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4056 
4057 	/*
4058 	 * Tell it the size of the rx interrupt queue
4059 	 */
4060 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4061 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4062 	if (status != 0) {
4063 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4064 		goto abort_with_fw;
4065 	}
4066 
4067 	/*
4068 	 * Ask for the maximum number of slices it supports
4069 	 */
4070 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4071 	if (status != 0) {
4072 		device_printf(sc->dev,
4073 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4074 		goto abort_with_fw;
4075 	}
4076 	max_slices = cmd.data0;
4077 
4078 	/*
4079 	 * Round down max slices count to the nearest power of 2
4080 	 */
4081 	i = 0;
4082 	while ((1 << (i + 1)) <= max_slices)
4083 		++i;
4084 	max_slices = 1 << i;
4085 
4086 	if (max_slices > msix_cnt)
4087 		max_slices = msix_cnt;
4088 
4089 	/* Clamp the requested slice count to a usable ring count */
4090 	sc->num_slices = if_ring_count2(num_slices, max_slices);
4091 
4092 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4093 	if (multi_tx)
4094 		sc->num_tx_rings = sc->num_slices;
4095 
4096 	if (bootverbose) {
4097 		device_printf(sc->dev, "using %d slices, max %d\n",
4098 		    sc->num_slices, max_slices);
4099 	}
4100 
4101 	if (sc->num_slices == 1)
4102 		goto abort_with_fw;
4103 	return;
4104 
4105 abort_with_fw:
4106 	sc->fw_name = old_fw;
4107 	mxge_load_firmware(sc, 0);
4108 }
4109 
4110 static void
4111 mxge_setup_serialize(struct mxge_softc *sc)
4112 {
4113 	int i = 0, slice;
4114 
4115 	/* Main + rx + tx */
4116 	sc->nserialize = (2 * sc->num_slices) + 1;
4117 	sc->serializes =
4118 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4119 	        M_DEVBUF, M_WAITOK | M_ZERO);
4120 
4121 	/*
4122 	 * Setup serializes
4123 	 *
4124 	 * NOTE: Order is critical
4125 	 */
4126 
4127 	KKASSERT(i < sc->nserialize);
4128 	sc->serializes[i++] = &sc->main_serialize;
4129 
4130 	for (slice = 0; slice < sc->num_slices; ++slice) {
4131 		KKASSERT(i < sc->nserialize);
4132 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4133 	}
4134 
4135 	for (slice = 0; slice < sc->num_slices; ++slice) {
4136 		KKASSERT(i < sc->nserialize);
4137 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4138 	}
4139 
4140 	KKASSERT(i == sc->nserialize);
4141 }
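
/*
 * Resulting layout of sc->serializes (the order matters to the
 * ifnet_serialize_array_* helpers below); e.g. with 4 slices,
 * nserialize = 2*4 + 1 = 9:
 *
 *	index 0                         main_serialize
 *	index 1 .. num_slices           per-slice rx serializers
 *	index num_slices+1 .. 2n        per-slice tx serializers
 */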
4142 
4143 static void
4144 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4145 {
4146 	struct mxge_softc *sc = ifp->if_softc;
4147 
4148 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4149 }
4150 
4151 static void
4152 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4153 {
4154 	struct mxge_softc *sc = ifp->if_softc;
4155 
4156 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4157 }
4158 
4159 static int
4160 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4161 {
4162 	struct mxge_softc *sc = ifp->if_softc;
4163 
4164 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4165 }
4166 
4167 #ifdef INVARIANTS
4168 
4169 static void
4170 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4171     boolean_t serialized)
4172 {
4173 	struct mxge_softc *sc = ifp->if_softc;
4174 
4175 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4176 	    slz, serialized);
4177 }
4178 
4179 #endif	/* INVARIANTS */
4180 
4181 #ifdef IFPOLL_ENABLE
4182 
4183 static void
4184 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4185 {
4186 	struct mxge_slice_state *ss = xss;
4187 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4188 
4189 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4190 
4191 	if (rx_done->entry[rx_done->idx].length != 0) {
4192 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4193 	} else {
4194 		/*
4195 		 * XXX
4196 		 * This register write obviously has a cost;
4197 		 * however, if we don't hand back the rx token,
4198 		 * the upcoming packets may suffer a ridiculously
4199 		 * large delay, as observed on 8AL-C using ping(8).
4200 		 */
4201 		*ss->irq_claim = be32toh(3);
4202 	}
4203 }
4204 
4205 static void
4206 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4207 {
4208 	struct mxge_softc *sc = ifp->if_softc;
4209 	int i;
4210 
4211 	if (info == NULL)
4212 		return;
4213 
4214 	/*
4215 	 * Only poll rx; polling tx and status doesn't seem to work
4216 	 */
4217 	for (i = 0; i < sc->num_slices; ++i) {
4218 		struct mxge_slice_state *ss = &sc->ss[i];
4219 		int idx = ss->intr_cpuid;
4220 
4221 		KKASSERT(idx < ncpus2);
4222 		info->ifpi_rx[idx].poll_func = mxge_npoll_rx;
4223 		info->ifpi_rx[idx].arg = ss;
4224 		info->ifpi_rx[idx].serializer = &ss->rx_data.rx_serialize;
4225 	}
4226 }
4227 
4228 #endif	/* IFPOLL_ENABLE */
4229 
4230 static int
4231 mxge_attach(device_t dev)
4232 {
4233 	mxge_softc_t *sc = device_get_softc(dev);
4234 	struct ifnet *ifp = &sc->arpcom.ac_if;
4235 	int err, rid, i;
4236 
4237 	/*
4238 	 * Avoid rewriting half the lines in this file to use
4239 	 * &sc->arpcom.ac_if instead
4240 	 */
4241 	sc->ifp = ifp;
4242 	sc->dev = dev;
4243 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4244 
4245 	/* IFM_ETH_FORCEPAUSE can't be changed */
4246 	ifmedia_init(&sc->media, IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
4247 	    mxge_media_change, mxge_media_status);
4248 
4249 	lwkt_serialize_init(&sc->main_serialize);
4250 
4251 	mxge_fetch_tunables(sc);
4252 
4253 	err = bus_dma_tag_create(NULL,			/* parent */
4254 				 1,			/* alignment */
4255 				 0,			/* boundary */
4256 				 BUS_SPACE_MAXADDR,	/* low */
4257 				 BUS_SPACE_MAXADDR,	/* high */
4258 				 NULL, NULL,		/* filter */
4259 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4260 				 0, 			/* num segs */
4261 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4262 				 0,			/* flags */
4263 				 &sc->parent_dmat);	/* tag */
4264 	if (err != 0) {
4265 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4266 		goto failed;
4267 	}
4268 
4269 	callout_init_mp(&sc->co_hdl);
4270 
4271 	mxge_setup_cfg_space(sc);
4272 
4273 	/*
4274 	 * Map the board into the kernel
4275 	 */
4276 	rid = PCIR_BARS;
4277 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4278 	    &rid, RF_ACTIVE);
4279 	if (sc->mem_res == NULL) {
4280 		device_printf(dev, "could not map memory\n");
4281 		err = ENXIO;
4282 		goto failed;
4283 	}
4284 
4285 	sc->sram = rman_get_virtual(sc->mem_res);
4286 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4287 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4288 		device_printf(dev, "impossible memory region size %ld\n",
4289 		    rman_get_size(sc->mem_res));
4290 		err = ENXIO;
4291 		goto failed;
4292 	}
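	/*
	 * For reference, the expression above works out to
	 * 2MB - (2*48KB + 32KB) - 256 = 1965824 bytes; the regions
	 * subtracted from the top of the 2MB LANai SRAM are presumably
	 * reserved for firmware/board use and must not be claimed by
	 * the driver.
	 */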
4293 
4294 	/*
4295 	 * Make a NUL-terminated copy of the EEPROM strings section of
4296 	 * LANai SRAM
4297 	 */
4298 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4299 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4300 	    rman_get_bushandle(sc->mem_res),
4301 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4302 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4303 	err = mxge_parse_strings(sc);
4304 	if (err != 0) {
4305 		device_printf(dev, "parse EEPROM string failed\n");
4306 		goto failed;
4307 	}
4308 
4309 	/*
4310 	 * Enable write combining for efficient use of PCIe bus
4311 	 */
4312 	mxge_enable_wc(sc);
4313 
4314 	/*
4315 	 * Allocate the out of band DMA memory
4316 	 */
4317 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4318 	if (err != 0) {
4319 		device_printf(dev, "alloc cmd DMA buf failed\n");
4320 		goto failed;
4321 	}
4322 	sc->cmd = sc->cmd_dma.dmem_addr;
4323 
4324 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4325 	if (err != 0) {
4326 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4327 		goto failed;
4328 	}
4329 
4330 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4331 	if (err != 0) {
4332 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4333 		goto failed;
4334 	}
4335 
4336 	/* Select & load the firmware */
4337 	err = mxge_select_firmware(sc);
4338 	if (err != 0) {
4339 		device_printf(dev, "select firmware failed\n");
4340 		goto failed;
4341 	}
4342 
4343 	mxge_slice_probe(sc);
4344 	err = mxge_alloc_slices(sc);
4345 	if (err != 0) {
4346 		device_printf(dev, "alloc slices failed\n");
4347 		goto failed;
4348 	}
4349 
4350 	err = mxge_alloc_intr(sc);
4351 	if (err != 0) {
4352 		device_printf(dev, "alloc intr failed\n");
4353 		goto failed;
4354 	}
4355 
4356 	/* Setup serializes */
4357 	mxge_setup_serialize(sc);
4358 
4359 	err = mxge_reset(sc, 0);
4360 	if (err != 0) {
4361 		device_printf(dev, "reset failed\n");
4362 		goto failed;
4363 	}
4364 
4365 	err = mxge_alloc_rings(sc);
4366 	if (err != 0) {
4367 		device_printf(dev, "failed to allocate rings\n");
4368 		goto failed;
4369 	}
4370 
4371 	ifp->if_baudrate = IF_Gbps(10UL);
4372 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4373 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4374 
4375 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4376 #if 0
4377 	/* Well, it's software, sigh */
4378 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4379 #endif
4380 	ifp->if_capenable = ifp->if_capabilities;
4381 
4382 	ifp->if_softc = sc;
4383 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4384 	ifp->if_init = mxge_init;
4385 	ifp->if_ioctl = mxge_ioctl;
4386 	ifp->if_start = mxge_start;
4387 #ifdef IFPOLL_ENABLE
4388 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4389 		ifp->if_npoll = mxge_npoll;
4390 #endif
4391 	ifp->if_serialize = mxge_serialize;
4392 	ifp->if_deserialize = mxge_deserialize;
4393 	ifp->if_tryserialize = mxge_tryserialize;
4394 #ifdef INVARIANTS
4395 	ifp->if_serialize_assert = mxge_serialize_assert;
4396 #endif
4397 
4398 	/* Increase TSO burst length */
4399 	ifp->if_tsolen = (32 * ETHERMTU);
4400 
4401 	/* Initialise the ifmedia structure */
4402 	mxge_media_init(sc);
4403 	mxge_media_probe(sc);
4404 
4405 	ether_ifattach(ifp, sc->mac_addr, NULL);
4406 
4407 	/* Setup TX rings and subqueues */
4408 	for (i = 0; i < sc->num_tx_rings; ++i) {
4409 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4410 		struct mxge_slice_state *ss = &sc->ss[i];
4411 
4412 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4413 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4414 		ifsq_set_priv(ifsq, &ss->tx);
4415 		ss->tx.ifsq = ifsq;
4416 
4417 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
4418 	}
4419 
4420 	/*
4421 	 * XXX
4422 	 * We are not ready to do "gather" jumbo frame, so
4423 	 * limit MTU to MJUMPAGESIZE
4424 	 */
4425 	sc->max_mtu = MJUMPAGESIZE -
4426 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
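	/*
	 * Worked example, assuming a 4KB MJUMPAGESIZE and the 2-byte
	 * MXGEFW_PAD: max_mtu = 4096 - 14 - 4 - 2 - 1 = 4075.
	 */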
4427 	sc->dying = 0;
4428 
4429 	err = mxge_setup_intr(sc);
4430 	if (err != 0) {
4431 		device_printf(dev, "alloc and setup intr failed\n");
4432 		ether_ifdetach(ifp);
4433 		goto failed;
4434 	}
4435 
4436 	mxge_add_sysctls(sc);
4437 
4438 	/* Increase non-cluster mbuf limit; used by small RX rings */
4439 	mb_inclimit(ifp->if_nmbclusters);
4440 
4441 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4442 	    sc->ss[0].intr_cpuid);
4443 	return 0;
4444 
4445 failed:
4446 	mxge_detach(dev);
4447 	return err;
4448 }
4449 
4450 static int
4451 mxge_detach(device_t dev)
4452 {
4453 	mxge_softc_t *sc = device_get_softc(dev);
4454 
4455 	if (device_is_attached(dev)) {
4456 		struct ifnet *ifp = sc->ifp;
4457 		int mblimit = ifp->if_nmbclusters;
4458 
4459 		ifnet_serialize_all(ifp);
4460 
4461 		sc->dying = 1;
4462 		if (ifp->if_flags & IFF_RUNNING)
4463 			mxge_close(sc, 1);
4464 		callout_stop(&sc->co_hdl);
4465 
4466 		mxge_teardown_intr(sc, sc->num_slices);
4467 
4468 		ifnet_deserialize_all(ifp);
4469 
4470 		callout_terminate(&sc->co_hdl);
4471 
4472 		ether_ifdetach(ifp);
4473 
4474 		/* Decrease non-cluster mbuf limit increased by us */
4475 		mb_inclimit(-mblimit);
4476 	}
4477 	ifmedia_removeall(&sc->media);
4478 
4479 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4480 	    sc->sram != NULL)
4481 		mxge_dummy_rdma(sc, 0);
4482 
4483 	mxge_free_intr(sc);
4484 	mxge_rem_sysctls(sc);
4485 	mxge_free_rings(sc);
4486 
4487 	/* MUST be called after sysctls, intr and rings are freed */
4488 	mxge_free_slices(sc);
4489 
4490 	if (sc->dmabench_dma.dmem_addr != NULL)
4491 		mxge_dma_free(&sc->dmabench_dma);
4492 	if (sc->zeropad_dma.dmem_addr != NULL)
4493 		mxge_dma_free(&sc->zeropad_dma);
4494 	if (sc->cmd_dma.dmem_addr != NULL)
4495 		mxge_dma_free(&sc->cmd_dma);
4496 
4497 	if (sc->msix_table_res != NULL) {
4498 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4499 		    sc->msix_table_res);
4500 	}
4501 	if (sc->mem_res != NULL) {
4502 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4503 		    sc->mem_res);
4504 	}
4505 
4506 	if (sc->parent_dmat != NULL)
4507 		bus_dma_tag_destroy(sc->parent_dmat);
4508 
4509 	return 0;
4510 }
4511 
4512 static int
4513 mxge_shutdown(device_t dev)
4514 {
4515 	return 0;
4516 }
4517 
4518 static void
4519 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4520 {
4521 	int i;
4522 
4523 	KKASSERT(sc->num_slices > 1);
4524 
4525 	for (i = 0; i < sc->num_slices; ++i) {
4526 		struct mxge_slice_state *ss = &sc->ss[i];
4527 
4528 		if (ss->intr_res != NULL) {
4529 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4530 			    ss->intr_rid, ss->intr_res);
4531 		}
4532 		if (ss->intr_rid >= 0)
4533 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4534 	}
4535 	if (setup)
4536 		pci_teardown_msix(sc->dev);
4537 }
4538 
4539 static int
4540 mxge_alloc_msix(struct mxge_softc *sc)
4541 {
4542 	struct mxge_slice_state *ss;
4543 	int offset, rid, error, i;
4544 	boolean_t setup = FALSE;
4545 
4546 	KKASSERT(sc->num_slices > 1);
4547 
4548 	if (sc->num_slices == ncpus2) {
4549 		offset = 0;
4550 	} else {
4551 		int offset_def;
4552 
4553 		offset_def = (sc->num_slices * device_get_unit(sc->dev)) %
4554 		    ncpus2;
4555 
4556 		offset = device_getenv_int(sc->dev, "msix.offset", offset_def);
4557 		if (offset >= ncpus2 ||
4558 		    offset % sc->num_slices != 0) {
4559 			device_printf(sc->dev, "invalid msix.offset %d, "
4560 			    "use %d\n", offset, offset_def);
4561 			offset = offset_def;
4562 		}
4563 	}
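	/*
	 * Example of the default placement above: unit 1 with 4
	 * slices on an 8-CPU (ncpus2) system gives
	 * offset_def = (4 * 1) % 8 = 4, so the per-slice vectors land
	 * on cpu4..cpu7 (ss->intr_cpuid = offset + i below).
	 */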
4564 
4565 	ss = &sc->ss[0];
4566 
4567 	ss->intr_serialize = &sc->main_serialize;
4568 	ss->intr_func = mxge_msi;
4569 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4570 	    "%s comb", device_get_nameunit(sc->dev));
4571 	ss->intr_desc = ss->intr_desc0;
4572 	ss->intr_cpuid = offset;
4573 
4574 	for (i = 1; i < sc->num_slices; ++i) {
4575 		ss = &sc->ss[i];
4576 
4577 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4578 		if (sc->num_tx_rings == 1) {
4579 			ss->intr_func = mxge_msix_rx;
4580 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4581 			    "%s rx", device_get_nameunit(sc->dev));
4582 		} else {
4583 			ss->intr_func = mxge_msix_rxtx;
4584 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4585 			    "%s rxtx", device_get_nameunit(sc->dev));
4586 		}
4587 		ss->intr_desc = ss->intr_desc0;
4588 		ss->intr_cpuid = offset + i;
4589 	}
4590 
4591 	rid = PCIR_BAR(2);
4592 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4593 	    &rid, RF_ACTIVE);
4594 	if (sc->msix_table_res == NULL) {
4595 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4596 		return ENXIO;
4597 	}
4598 
4599 	error = pci_setup_msix(sc->dev);
4600 	if (error) {
4601 		device_printf(sc->dev, "could not setup MSI-X\n");
4602 		goto back;
4603 	}
4604 	setup = TRUE;
4605 
4606 	for (i = 0; i < sc->num_slices; ++i) {
4607 		ss = &sc->ss[i];
4608 
4609 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4610 		    ss->intr_cpuid);
4611 		if (error) {
4612 			device_printf(sc->dev, "could not alloc "
4613 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4614 			goto back;
4615 		}
4616 
4617 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4618 		    &ss->intr_rid, RF_ACTIVE);
4619 		if (ss->intr_res == NULL) {
4620 			device_printf(sc->dev, "could not alloc "
4621 			    "MSI-X %d resource\n", i);
4622 			error = ENXIO;
4623 			goto back;
4624 		}
4625 	}
4626 
4627 	pci_enable_msix(sc->dev);
4628 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4629 back:
4630 	if (error)
4631 		mxge_free_msix(sc, setup);
4632 	return error;
4633 }
4634 
4635 static int
4636 mxge_alloc_intr(struct mxge_softc *sc)
4637 {
4638 	struct mxge_slice_state *ss;
4639 	u_int irq_flags;
4640 
4641 	if (sc->num_slices > 1) {
4642 		int error;
4643 
4644 		error = mxge_alloc_msix(sc);
4645 		if (error)
4646 			return error;
4647 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4648 		return 0;
4649 	}
4650 
4651 	ss = &sc->ss[0];
4652 
4653 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4654 	    &ss->intr_rid, &irq_flags);
4655 
4656 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4657 	    &ss->intr_rid, irq_flags);
4658 	if (ss->intr_res == NULL) {
4659 		device_printf(sc->dev, "could not alloc interrupt\n");
4660 		return ENXIO;
4661 	}
4662 
4663 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4664 		ss->intr_func = mxge_legacy;
4665 	else
4666 		ss->intr_func = mxge_msi;
4667 	ss->intr_serialize = &sc->main_serialize;
4668 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4669 
4670 	return 0;
4671 }
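
/*
 * Interrupt allocation summary: multi-slice configurations require
 * MSI-X (one vector per slice, set up in mxge_alloc_msix() above),
 * while a single-slice configuration falls back to one MSI or legacy
 * INTx vector via pci_alloc_1intr(), steered to whatever CPU the
 * interrupt resource was assigned (rman_get_cpuid()).
 */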
4672 
4673 static int
4674 mxge_setup_intr(struct mxge_softc *sc)
4675 {
4676 	int i;
4677 
4678 	for (i = 0; i < sc->num_slices; ++i) {
4679 		struct mxge_slice_state *ss = &sc->ss[i];
4680 		int error;
4681 
4682 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4683 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4684 		    ss->intr_serialize, ss->intr_desc);
4685 		if (error) {
4686 			device_printf(sc->dev, "can't set up %dth intr\n", i);
4687 			mxge_teardown_intr(sc, i);
4688 			return error;
4689 		}
4690 	}
4691 	return 0;
4692 }
4693 
4694 static void
4695 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4696 {
4697 	int i;
4698 
4699 	if (sc->ss == NULL)
4700 		return;
4701 
4702 	for (i = 0; i < cnt; ++i) {
4703 		struct mxge_slice_state *ss = &sc->ss[i];
4704 
4705 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4706 	}
4707 }
4708 
4709 static void
4710 mxge_free_intr(struct mxge_softc *sc)
4711 {
4712 	if (sc->ss == NULL)
4713 		return;
4714 
4715 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4716 		struct mxge_slice_state *ss = &sc->ss[0];
4717 
4718 		if (ss->intr_res != NULL) {
4719 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4720 			    ss->intr_rid, ss->intr_res);
4721 		}
4722 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4723 			pci_release_msi(sc->dev);
4724 	} else {
4725 		mxge_free_msix(sc, TRUE);
4726 	}
4727 }
4728