xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision edf2e657)
/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $

***************************************************************************/

#include "opt_ifpoll.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/if_ringmap.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_poll.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>
#include <net/toeplitz.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__x86_64__)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
#include <dev/netif/mxge/if_mxge_var.h>

#define MXGE_IFM	(IFM_ETHER | IFM_FDX | IFM_ETH_FORCEPAUSE)

#define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
#define MXGE_HWRSS_KEYLEN		16

/* Tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
static int mxge_deassert_wait = 1;
static int mxge_ticks;
static int mxge_num_slices = 0;
static int mxge_always_promisc = 0;
static int mxge_throttle = 0;
static int mxge_msi_enable = 1;
static int mxge_msix_enable = 1;
static int mxge_multi_tx = 1;
/*
 * Don't use RSS by default, it's just too slow
 */
static int mxge_use_rss = 0;

static char mxge_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_FORCE_FULL;

static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
static const char *mxge_fw_aligned = "mxge_eth_z8e";
static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
TUNABLE_STR("hw.mxge.flow_ctrl", mxge_flowctrl, sizeof(mxge_flowctrl));
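
/*
 * The tunables above are normally set from loader.conf(5).  The values
 * below are purely illustrative, not recommendations:
 *
 *	hw.mxge.num_slices="0"		# 0 = let the driver decide
 *	hw.mxge.intr_coal_delay="30"	# in microseconds
 *	hw.mxge.use_rss="1"
 */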

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);

static int mxge_alloc_intr(struct mxge_softc *sc);
static void mxge_free_intr(struct mxge_softc *sc);
static int mxge_setup_intr(struct mxge_softc *sc);
static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);

static device_method_t mxge_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	DEVMETHOD_END
};

static driver_t mxge_driver = {
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static void mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
static void mxge_watchdog_reset(mxge_softc_t *sc);
static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);

static int
mxge_probe(device_t dev)
{
	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
		int rev = pci_get_revid(dev);

		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__x86_64__)
	vm_offset_t len;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
	    PAT_WRITE_COMBINING);
#endif
}

static int
mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
    bus_size_t alignment)
{
	bus_size_t boundary;
	int err;

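	/*
	 * An allocation larger than 4KB that is itself 4KB-aligned
	 * necessarily crosses 4KB boundaries, so the boundary
	 * restriction must be dropped for it; every other allocation
	 * is confined to a single 4KB region.
	 */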
	if (bytes > 4096 && alignment == 4096)
		boundary = 0;
	else
		boundary = 4096;

	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
		return err;
	}
	return 0;
}

static void
mxge_dma_free(bus_dmamem_t *dma)
{
	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
	bus_dma_tag_destroy(dma->dmem_tag);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
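/*
 * For example, a buffer such as (illustrative values only)
 *
 *	"SN=123456\0MAC=00:60:dd:aa:bb:cc\0PC=M3F-PCIXE-8B\0\0"
 *
 * yields mac_addr 00:60:dd:aa:bb:cc, serial number "123456" and
 * product code "M3F-PCIXE-8B".
 */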
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	const char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");
	return ENXIO;
}

#if defined(__x86_64__)

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/*
	 * XXXX
	 * Test below is commented because it is believed that doing
	 * config read/write beyond 0xff will access the config space
	 * for the next larger function.  Uncomment this and remove
	 * the hacky pmap_mapdev() way of accessing config space when
	 * DragonFly grows support for extended pcie config space access.
	 */
#if 0
	/*
	 * See if we can, by some miracle, access the extended
	 * config space
	 */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/*
	 * Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off = base + 0x00100000UL * (unsigned long)bus +
	    0x00001000UL * (unsigned long)(func + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (!(vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
		    vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t *)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (bootverbose) {
		device_printf(sc->dev, "Enabled ECRC on upstream "
		    "Nvidia bridge at %d:%d:%d\n",
		    (int)bus, (int)slot, (int)func);
	}
}

#else	/* __x86_64__ */

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
}

#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
	int status;
	uint32_t len;
	const char *test = " ";

	/*
	 * Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0 >> 16) * len * 2 * 2) /
	    (cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
		    test, status);
	}
	return status;
}
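
/*
 * Worked example with illustrative numbers: if the firmware returns
 * cmd.data0 = (100 << 16) | 512, then 100 transfers of len = 4096
 * bytes completed in 512 ticks of 0.5us, i.e. 409600 bytes in 256us,
 * so ((cmd.data0 >> 16) * len * 2) / (cmd.data0 & 0xffff) =
 * (100 * 4096 * 2) / 512 = 1600, roughly 1600MB/s.
 */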

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;

	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
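	/*
	 * The max read request size is encoded in bits 14:12 of the
	 * PCIe device control register (offset 0x8 into the PCIe
	 * capability); the encoding 5 selects 4096 bytes.
	 */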
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
			    pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * Load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0)
		return status;

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS) {
		device_printf(dev, "Falling back to ethp! "
		    "Please install up-to-date fw\n");
	}
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (bootverbose) {
			device_printf(sc->dev,
			    "Assuming %s completions (forced)\n",
			    aligned ? "aligned" : "unaligned");
		}
		goto abort;
	}

	/*
	 * If the PCIe link width is 4 or less, we can use the aligned
	 * firmware and skip any checks
	 */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev, "PCIe x%d Link, "
		    "expect reduced performance\n", sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (mxge_firmware_probe(sc) == 0)
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return mxge_load_firmware(sc, 0);
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
		    be32toh(hdr->mcp_type));
		return EIO;
	}

	/* Save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (bootverbose)
		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	    &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		if_printf(sc->ifp, "Found firmware version %s\n",
		    sc->fw_version);
		if_printf(sc->ifp, "Driver needs %d.%d\n",
		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	return kmalloc(items * size, M_TEMP, M_WAITOK);
}

static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		if_printf(sc->ifp, "Could not find firmware image %s\n",
		    sc->fw_name);
		return ENOENT;
	}

	/* Setup zlib and decompress f/w */
	bzero(&zs, sizeof(zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/*
	 * The uncompressed size is stored as the firmware version,
	 * which would otherwise go unused
	 */
	fw_len = (size_t)fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		if_printf(sc->ifp, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* Check id */
	hdr_offset =
	htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		if_printf(sc->ifp, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void *)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
		    min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
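
/*
 * A minimal sketch, assuming the convention used by the mxge firmware
 * modules: each compressed image is registered with firmware(9) with
 * its *uncompressed* length passed through the otherwise-unused
 * version argument, which is how fw_len is recovered above.  The
 * names below are hypothetical, for illustration only:
 *
 *	firmware_register("mxge_ethp_z8e", z8e_zdata, z8e_zdatasize,
 *	    z8e_inflated_length, NULL);
 */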

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send an rdma command to the PCIe engine, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
		    (enable ? "enable" : "disable"), confirm, *confirm);
	}
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* Ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof(*buf));

	/*
	 * Wait up to 20ms
	 */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			if_printf(sc->ifp, "command %d failed, result = %d\n",
			    cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN) {
		if_printf(sc->ifp, "command %d timed out, result = %d\n",
		    cmd, be32toh(response->result));
	}
	return err;
}
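
/*
 * Typical call pattern, mirroring the callers later in this file; the
 * command arguments travel in host byte order and are swapped above:
 *
 *	mxge_cmd_t cmd;
 *
 *	memset(&cmd, 0, sizeof(cmd));
 *	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
 */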

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof(struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/*
	 * Find running firmware header
	 */
	hdr_offset =
	htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		if_printf(sc->ifp, "Running firmware has bad header offset "
		    "(%zu)\n", hdr_offset);
		return EIO;
	}

	/*
	 * Copy header of running firmware from SRAM to host memory to
	 * validate firmware
	 */
	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * Check to see if the adopted firmware has a bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
		    "working around rx filter bug\n",
		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;

		/*
		 * Try to use the currently running firmware, if
		 * it is new enough
		 */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			if_printf(sc->ifp,
			    "failed to adopt running firmware\n");
			return status;
		}
		if_printf(sc->ifp, "Successfully adopted running firmware\n");

		if (sc->tx_boundary == 4096) {
			if_printf(sc->ifp,
			    "Using firmware currently running on NIC.  "
			    "For optimal\n");
			if_printf(sc->ifp, "performance consider loading "
			    "optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send a reload command to the bootstrap MCP, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/*
	 * FIX: All newest firmware should un-protect the bottom of
	 * the sram before handoff. However, the very first interfaces
	 * do not. Therefore the handoff copy must skip the first 8 bytes
	 */
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000 * 10);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
		    confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;

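	/*
	 * Pack the address big-endian style; e.g. (an illustrative
	 * address) 00:60:dd:aa:bb:cc yields data0 = 0x0060ddaa and
	 * data1 = 0x0000bbcc.
	 */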
	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
	    (addr[2] << 8) | addr[3];
	cmd.data1 = (addr[4] << 8) | (addr[5]);
	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
	if (status) {
		if_printf(sc->ifp, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	bzero(&cmd, sizeof(cmd));	/* avoid gcc warning */
	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
	if (status)
		if_printf(sc->ifp, "Failed to set promisc mode\n");
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
		    "error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI) {
		/* Request to disable multicast filtering, so quit here */
		return;
	}

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
		    "error status: %d\n", err);
		return;
	}

	/*
	 * Walk the multicast list, and add each address
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;

		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		    &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		    &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
			    "error status: %d\n", err);
			/* Abort, leaving multicast filtering off */
			return;
		}
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
		    "error status: %d\n", err);
	}
}

#if 0
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/*
	 * Try to set nbufs to see if we can
	 * use virtually contiguous jumbos
	 */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
#endif

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status, rx_intr_size;

	/*
	 * Try to send a reset command to the card to see if it
	 * is alive
	 */
	memset(&cmd, 0, sizeof(cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		if_printf(sc->ifp, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/*
	 * Set the intrq size
	 * XXX assume 4-byte mcp_slot
	 */
	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
	cmd.data0 = rx_intr_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
	if (sc->num_slices > 1) {
		/* Ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to get number of slices\n");
			return status;
		}

		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		if (sc->num_tx_rings > 1)
			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];

			rx_done = &ss->rx_data.rx_done;
			memset(rx_done->entry, 0, rx_intr_size);

			cmd.data0 =
			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data1 =
			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
			    &cmd);
		}
	}

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
	    &cmd);
	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);

	if (status != 0) {
		if_printf(sc->ifp, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* Run a DMA benchmark */
	mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);

		/* Reset mcp/driver shared state back to 0 */
		ss->rx_data.rx_done.idx = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->rx_data.rx_big.cnt = 0;
		ss->rx_data.rx_small.cnt = 0;
		if (ss->fw_stats != NULL)
			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
	}
	sc->rdma_tags_available = 15;

	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);

	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
			if_printf(sc->ifp, "can't enable throttle\n");
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0)
		return err;

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	int err, use_rss;

	sc = arg1;
	use_rss = sc->use_rss;
	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
	if (err != 0)
		return err;

	if (use_rss == sc->use_rss)
		return 0;

	ifnet_serialize_all(sc->ifp);

	sc->use_rss = use_rss;
	if (sc->ifp->if_flags & IFF_RUNNING) {
		mxge_close(sc, 0);
		mxge_open(sc);
	}

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0)
		return err;

	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000 * 1000)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
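	/*
	 * Pass the byte-swapped value via arg2 with a NULL arg1;
	 * sysctl_handle_int() then exports arg2 itself as the
	 * (read-only) integer value.
	 */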
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	if (sc->ss != NULL) {
		struct mxge_slice_state *ss;
		int slice;

		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];
			if (ss->sysctl_tree != NULL) {
				sysctl_ctx_free(&ss->sysctl_ctx);
				ss->sysctl_tree = NULL;
			}
		}
	}

	if (sc->slice_sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->slice_sysctl_ctx);
		sc->slice_sysctl_tree = NULL;
	}
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/*
	 * Random information
	 */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
	    CTLFLAG_RD, &sc->link_width, 0, "link width");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
	    CTLFLAG_RD, &sc->read_write_dma, 0,
	    "DMA concurrent Read/Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
	    CTLFLAG_RD, &sc->watchdog_resets, 0,
	    "Number of times NIC was reset");

	if (sc->num_slices > 1) {
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "slice_cpumap",
		    CTLTYPE_OPAQUE | CTLFLAG_RD, sc->ring_map, 0,
		    if_ringmap_cpumap_sysctl, "I", "slice CPU map");
	}

	/*
	 * Performance related tunables
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
	    "Interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
	    "Transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
	    "Use RSS");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
	    CTLFLAG_RW, &mxge_deassert_wait, 0,
	    "Wait for IRQ line to go low in ihandler");

	/*
	 * Stats block from firmware is in network byte order.
	 * Need to swap it
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->link_up, 0,
	    mxge_handle_be32, "I", "link up");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->rdma_tags_available, 0,
	    mxge_handle_be32, "I", "rdma_tags_available");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
	    mxge_handle_be32, "I", "dropped_bad_crc32");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_bad_phy, 0,
	    mxge_handle_be32, "I", "dropped_bad_phy");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_link_overflow, 0,
	    mxge_handle_be32, "I", "dropped_link_overflow");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_multicast_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_big_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_small_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_overrun, 0,
	    mxge_handle_be32, "I", "dropped_overrun");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_pause, 0,
	    mxge_handle_be32, "I", "dropped_pause");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_runt, 0,
	    mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_unicast_filtered");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
	if (sc->slice_sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add slice sysctl node\n");
		return;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
		    slice_num, CTLFLAG_RD, 0, "");
		if (ss->sysctl_tree == NULL) {
			device_printf(sc->dev,
			    "can't add %d slice sysctl node\n", slice);
			return;	/* XXX continue? */
		}
		children = SYSCTL_CHILDREN(ss->sysctl_tree);

		/*
		 * XXX change to ULONG
		 */

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * backwards one at a time and handle ring wraps
 */
static __inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the NIC.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
static __inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < cnt - 1; i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * Submit all but the first request, and ensure
		 * that it is submitted below
		 */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* Submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* Re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}
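
/*
 * A note on the 32-byte copies above: sizeof(mcp_kreq_ether_send_t)
 * is 16 bytes, so each mxge_pio_copy() of two requests is a single
 * aligned 32-byte burst, which the NIC accepts without falling back
 * to its slower software PIO handler.  The first request's flags are
 * rewritten last so the firmware never sees a partially written
 * chain marked valid.
 */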
1640 
1641 static int
1642 mxge_pullup_tso(struct mbuf **mp)
1643 {
1644 	int hoff, iphlen, thoff;
1645 	struct mbuf *m;
1646 
1647 	m = *mp;
1648 	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1649 
1650 	iphlen = m->m_pkthdr.csum_iphlen;
1651 	thoff = m->m_pkthdr.csum_thlen;
1652 	hoff = m->m_pkthdr.csum_lhlen;
1653 
1654 	KASSERT(iphlen > 0, ("invalid ip hlen"));
1655 	KASSERT(thoff > 0, ("invalid tcp hlen"));
1656 	KASSERT(hoff > 0, ("invalid ether hlen"));
1657 
1658 	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1659 		m = m_pullup(m, hoff + iphlen + thoff);
1660 		if (m == NULL) {
1661 			*mp = NULL;
1662 			return ENOBUFS;
1663 		}
1664 		*mp = m;
1665 	}
1666 	return 0;
1667 }
1668 
1669 static int
1670 mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
1671     struct mbuf *m, int busdma_seg_cnt)
1672 {
1673 	mcp_kreq_ether_send_t *req;
1674 	bus_dma_segment_t *seg;
1675 	uint32_t low, high_swapped;
1676 	int len, seglen, cum_len, cum_len_next;
1677 	int next_is_first, chop, cnt, rdma_count, small;
1678 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1679 	uint8_t flags, flags_next;
1680 	struct mxge_buffer_state *info_last;
1681 	bus_dmamap_t map = info_map->map;
1682 
1683 	mss = m->m_pkthdr.tso_segsz;
1684 
1685 	/*
1686 	 * Negative cum_len signifies to the send loop that we are
1687 	 * still in the header portion of the TSO packet.
1688 	 */
1689 	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1690 	    m->m_pkthdr.csum_thlen);
1691 
1692 	/*
1693 	 * TSO implies checksum offload on this hardware
1694 	 */
1695 	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1696 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1697 
1698 	/*
1699 	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
1700 	 * out where to put the checksum by parsing the header.
1701 	 */
1702 	pseudo_hdr_offset = htobe16(mss);
1703 
1704 	req = tx->req_list;
1705 	seg = tx->seg_list;
1706 	cnt = 0;
1707 	rdma_count = 0;
1708 
1709 	/*
1710 	 * "rdma_count" is the number of RDMAs belonging to the current
1711 	 * packet BEFORE the current send request.  For non-TSO packets,
1712 	 * this is equal to "count".
1713 	 *
1714 	 * For TSO packets, rdma_count needs to be reset to 0 after a
1715 	 * segment cut.
1716 	 *
1717 	 * The rdma_count field of the send request is the number of
1718 	 * RDMAs of the packet starting at that request.  For TSO send
1719 	 * requests with one ore more cuts in the middle, this is the
1720 	 * number of RDMAs starting after the last cut in the request.
1721 	 * All previous segments before the last cut implicitly have 1
1722 	 * RDMA.
1723 	 *
1724 	 * Since the number of RDMAs is not known beforehand, it must be
1725 	 * filled-in retroactively - after each segmentation cut or at
1726 	 * the end of the entire packet.
1727 	 */
1728 
1729 	while (busdma_seg_cnt) {
1730 		/*
1731 		 * Break the busdma segment up into pieces
1732 		 */
1733 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1734 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1735 		len = seg->ds_len;
1736 
1737 		while (len) {
1738 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1739 			seglen = len;
1740 			cum_len_next = cum_len + seglen;
1741 			(req - rdma_count)->rdma_count = rdma_count + 1;
1742 			if (__predict_true(cum_len >= 0)) {
1743 				/* Payload */
1744 				chop = (cum_len_next > mss);
1745 				cum_len_next = cum_len_next % mss;
1746 				next_is_first = (cum_len_next == 0);
1747 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1748 				flags_next |=
1749 				    next_is_first * MXGEFW_FLAGS_FIRST;
1750 				rdma_count |= -(chop | next_is_first);
1751 				rdma_count += chop & !next_is_first;
1752 			} else if (cum_len_next >= 0) {
1753 				/* Header ends */
1754 				rdma_count = -1;
1755 				cum_len_next = 0;
1756 				seglen = -cum_len;
1757 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1758 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1759 				    MXGEFW_FLAGS_FIRST |
1760 				    (small * MXGEFW_FLAGS_SMALL);
1761 			}
1762 
1763 			req->addr_high = high_swapped;
1764 			req->addr_low = htobe32(low);
1765 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1766 			req->pad = 0;
1767 			req->rdma_count = 1;
1768 			req->length = htobe16(seglen);
1769 			req->cksum_offset = cksum_offset;
1770 			req->flags =
1771 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1772 			low += seglen;
1773 			len -= seglen;
1774 			cum_len = cum_len_next;
1775 			flags = flags_next;
1776 			req++;
1777 			cnt++;
1778 			rdma_count++;
1779 			if (__predict_false(cksum_offset > seglen))
1780 				cksum_offset -= seglen;
1781 			else
1782 				cksum_offset = 0;
1783 			if (__predict_false(cnt > tx->max_desc))
1784 				goto drop;
1785 		}
1786 		busdma_seg_cnt--;
1787 		seg++;
1788 	}
1789 	(req - rdma_count)->rdma_count = rdma_count;
1790 
1791 	do {
1792 		req--;
1793 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1794 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1795 
1796 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1797 
1798 	info_map->map = info_last->map;
1799 	info_last->map = map;
1800 	info_last->m = m;
1801 
1802 	mxge_submit_req(tx, tx->req_list, cnt);
1803 
1804 	if (tx->send_go != NULL && tx->queue_active == 0) {
1805 		/* Tell the NIC to start polling this slice */
1806 		*tx->send_go = 1;
1807 		tx->queue_active = 1;
1808 		tx->activate++;
1809 		wmb();
1810 	}
1811 	return 0;
1812 
1813 drop:
1814 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1815 	m_freem(m);
1816 	return ENOBUFS;
1817 }
1818 
1819 static int
1820 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1821 {
1822 	mcp_kreq_ether_send_t *req;
1823 	bus_dma_segment_t *seg;
1824 	bus_dmamap_t map;
1825 	int cnt, cum_len, err, i, idx, odd_flag;
1826 	uint16_t pseudo_hdr_offset;
1827 	uint8_t flags, cksum_offset;
1828 	struct mxge_buffer_state *info_map, *info_last;
1829 
1830 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1831 		err = mxge_pullup_tso(&m);
1832 		if (__predict_false(err))
1833 			return err;
1834 	}
1835 
1836 	/*
1837 	 * Map the frame for DMA
1838 	 */
1839 	idx = tx->req & tx->mask;
1840 	info_map = &tx->info[idx];
1841 	map = info_map->map;
1842 
1843 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1844 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1845 	if (__predict_false(err != 0))
1846 		goto drop;
1847 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1848 
1849 	/*
1850 	 * TSO is different enough, we handle it in another routine
1851 	 */
1852 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1853 		return mxge_encap_tso(tx, info_map, m, cnt);
1854 
1855 	req = tx->req_list;
1856 	cksum_offset = 0;
1857 	pseudo_hdr_offset = 0;
1858 	flags = MXGEFW_FLAGS_NO_TSO;
1859 
1860 	/*
1861 	 * Checksum offloading
1862 	 */
1863 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1864 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1865 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1866 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1867 		req->cksum_offset = cksum_offset;
1868 		flags |= MXGEFW_FLAGS_CKSUM;
1869 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1870 	} else {
1871 		odd_flag = 0;
1872 	}
1873 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1874 		flags |= MXGEFW_FLAGS_SMALL;
1875 
1876 	/*
1877 	 * Convert segments into a request list
1878 	 */
1879 	cum_len = 0;
1880 	seg = tx->seg_list;
1881 	req->flags = MXGEFW_FLAGS_FIRST;
1882 	for (i = 0; i < cnt; i++) {
1883 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1884 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1885 		req->length = htobe16(seg->ds_len);
1886 		req->cksum_offset = cksum_offset;
1887 		if (cksum_offset > seg->ds_len)
1888 			cksum_offset -= seg->ds_len;
1889 		else
1890 			cksum_offset = 0;
1891 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1892 		req->pad = 0; /* complete solid 16-byte block */
1893 		req->rdma_count = 1;
1894 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1895 		cum_len += seg->ds_len;
1896 		seg++;
1897 		req++;
1898 		req->flags = 0;
1899 	}
1900 	req--;
1901 
1902 	/*
1903 	 * Pad runt to 60 bytes
1904 	 */
1905 	if (cum_len < 60) {
1906 		req++;
1907 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1908 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1909 		req->length = htobe16(60 - cum_len);
1910 		req->cksum_offset = 0;
1911 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 		req->pad = 0; /* complete solid 16-byte block */
1913 		req->rdma_count = 1;
1914 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1915 		cnt++;
1916 	}
1917 
1918 	tx->req_list[0].rdma_count = cnt;
1919 #if 0
1920 	/* print what the firmware will see */
1921 	for (i = 0; i < cnt; i++) {
1922 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1923 		    "cso:%d, flags:0x%x, rdma:%d\n",
1924 		    i, (int)ntohl(tx->req_list[i].addr_high),
1925 		    (int)ntohl(tx->req_list[i].addr_low),
1926 		    (int)ntohs(tx->req_list[i].length),
1927 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1928 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1929 		    tx->req_list[i].rdma_count);
1930 	}
1931 	kprintf("--------------\n");
1932 #endif
1933 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1934 
1935 	info_map->map = info_last->map;
1936 	info_last->map = map;
1937 	info_last->m = m;
1938 
1939 	mxge_submit_req(tx, tx->req_list, cnt);
1940 
1941 	if (tx->send_go != NULL && tx->queue_active == 0) {
1942 		/* Tell the NIC to start polling this slice */
1943 		*tx->send_go = 1;
1944 		tx->queue_active = 1;
1945 		tx->activate++;
1946 		wmb();
1947 	}
1948 	return 0;
1949 
1950 drop:
1951 	m_freem(m);
1952 	return err;
1953 }
1954 
1955 static void
1956 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1957 {
1958 	mxge_softc_t *sc = ifp->if_softc;
1959 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1960 	bus_addr_t zeropad;
1961 	int encap = 0;
1962 
1963 	KKASSERT(tx->ifsq == ifsq);
1964 	ASSERT_SERIALIZED(&tx->tx_serialize);
1965 
1966 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1967 		return;
1968 
1969 	zeropad = sc->zeropad_dma.dmem_busaddr;
1970 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1971 		struct mbuf *m;
1972 		int error;
1973 
1974 		m = ifsq_dequeue(ifsq);
1975 		if (m == NULL)
1976 			goto done;
1977 
1978 		BPF_MTAP(ifp, m);
1979 		error = mxge_encap(tx, m, zeropad);
1980 		if (!error)
1981 			encap = 1;
1982 		else
1983 			IFNET_STAT_INC(ifp, oerrors, 1);
1984 	}
1985 
1986 	/* Ran out of transmit slots */
1987 	ifsq_set_oactive(ifsq);
1988 done:
1989 	if (encap)
1990 		tx->watchdog.wd_timer = 5;
1991 }
1992 
1993 static void
1994 mxge_watchdog(struct ifaltq_subque *ifsq)
1995 {
1996 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
1997 	struct mxge_softc *sc = ifp->if_softc;
1998 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1999 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
2000 
2001 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2002 
2003 	/* Check for pause blocking before resetting */
2004 	if (tx->watchdog_rx_pause == rx_pause) {
2005 		mxge_warn_stuck(sc, tx, 0);
2006 		mxge_watchdog_reset(sc);
2007 		return;
2008 	} else {
2009 		if_printf(ifp, "Flow control blocking xmits, "
2010 		    "check link partner\n");
2011 	}
2012 	tx->watchdog_rx_pause = rx_pause;
2013 }
2014 
2015 /*
2016  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2017  * at most 32 bytes at a time, so as to avoid involving the software
2018  * pio handler in the nic.  We re-write the first segment's low
2019  * DMA address to mark it valid only after we write the entire chunk
2020  * in a burst
2021  */
2022 static __inline void
2023 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2024     mcp_kreq_ether_recv_t *src)
2025 {
2026 	uint32_t low;
2027 
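	/*
	 * Stash the real DMA address and temporarily poison the first
	 * entry with 0xffffffff, so the NIC ignores the whole group of
	 * 8 until the burst below completes and the address is restored.
	 */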
2028 	low = src->addr_low;
2029 	src->addr_low = 0xffffffff;
2030 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2031 	wmb();
2032 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2033 	wmb();
2034 	src->addr_low = low;
2035 	dst->addr_low = low;
2036 	wmb();
2037 }
2038 
2039 static int
2040 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2041     boolean_t init)
2042 {
2043 	bus_dma_segment_t seg;
2044 	struct mbuf *m;
2045 	int cnt, err, mflag;
2046 
2047 	mflag = M_NOWAIT;
2048 	if (__predict_false(init))
2049 		mflag = M_WAITOK;
2050 
2051 	m = m_gethdr(mflag, MT_DATA);
2052 	if (m == NULL) {
2053 		err = ENOBUFS;
2054 		if (__predict_false(init)) {
2055 			/*
2056 			 * During initialization, there
2057 			 * is nothing to set up; bail out
2058 			 */
2059 			return err;
2060 		}
2061 		goto done;
2062 	}
2063 	m->m_len = m->m_pkthdr.len = MHLEN;
2064 
2065 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2066 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2067 	if (err != 0) {
2068 		m_freem(m);
2069 		if (__predict_false(init)) {
2070 			/*
2071 			 * During initialization, there
2072 			 * is nothing to set up; bail out
2073 			 */
2074 			return err;
2075 		}
2076 		goto done;
2077 	}
2078 
2079 	rx->info[idx].m = m;
2080 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2081 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2082 
2083 done:
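	/* Receive buffers are handed to the NIC in groups of 8 */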
2084 	if ((idx & 7) == 7)
2085 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2086 	return err;
2087 }
2088 
2089 static int
2090 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2091     boolean_t init)
2092 {
2093 	bus_dma_segment_t seg;
2094 	struct mbuf *m;
2095 	int cnt, err, mflag;
2096 
2097 	mflag = M_NOWAIT;
2098 	if (__predict_false(init))
2099 		mflag = M_WAITOK;
2100 
2101 	if (rx->cl_size == MCLBYTES)
2102 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2103 	else
2104 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2105 	if (m == NULL) {
2106 		err = ENOBUFS;
2107 		if (__predict_false(init)) {
2108 			/*
2109 			 * During initialization, there
2110 			 * is nothing to set up; bail out
2111 			 */
2112 			return err;
2113 		}
2114 		goto done;
2115 	}
2116 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2117 
2118 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2119 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2120 	if (err != 0) {
2121 		m_freem(m);
2122 		if (__predict_false(init)) {
2123 			/*
2124 			 * During initialization, there
2125 			 * is nothing to set up; bail out
2126 			 */
2127 			return err;
2128 		}
2129 		goto done;
2130 	}
2131 
2132 	rx->info[idx].m = m;
2133 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2134 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2135 
2136 done:
2137 	if ((idx & 7) == 7)
2138 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2139 	return err;
2140 }
2141 
2142 /*
2143  * Myri10GE hardware checksums are not valid if the sender
2144  * padded the frame with non-zero padding.  This is because
2145  * the firmware just does a simple 16-bit 1s complement
2146  * checksum across the entire frame, excluding the first 14
2147  * bytes.  It is best to simply check the checksum and
2148  * tell the stack about it only if the checksum is good
2149  */
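/*
 * A sketch of why the fold below works: the firmware's sum covers the
 * IP header plus the TCP/UDP segment.  A valid IP header itself sums
 * to 0xffff in ones-complement, so adding the pseudo-header fields on
 * top of the hardware sum reproduces the normal receive-side TCP/UDP
 * verification, which also yields 0xffff for a good packet; the final
 * xor maps that to the 0 which callers test for.
 */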
2150 static __inline uint16_t
2151 mxge_rx_csum(struct mbuf *m, int csum)
2152 {
2153 	const struct ether_header *eh;
2154 	const struct ip *ip;
2155 	uint16_t c;
2156 
2157 	eh = mtod(m, const struct ether_header *);
2158 
2159 	/* Only deal with IPv4 TCP & UDP for now */
2160 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2161 		return 1;
2162 
2163 	ip = (const struct ip *)(eh + 1);
2164 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2165 		return 1;
2166 
2167 #ifdef INET
2168 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2169 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2170 	          (ip->ip_hl << 2) + ip->ip_p));
2171 #else
2172 	c = 1;
2173 #endif
2174 	c ^= 0xffff;
2175 	return c;
2176 }
2177 
2178 static void
2179 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2180 {
2181 	struct ether_vlan_header *evl;
2182 	uint32_t partial;
2183 
2184 	evl = mtod(m, struct ether_vlan_header *);
2185 
2186 	/*
2187 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2188 	 * what the firmware thought was the end of the ethernet
2189 	 * header.
2190 	 */
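	/*
	 * Adding ~partial with an end-around carry is ones-complement
	 * subtraction: it removes the 32-bit word at the firmware's
	 * checksum start (the VLAN TCI plus the encapsulated type) from
	 * the running sum, and the two shift-and-add folds reduce the
	 * 32-bit accumulator back to 16 bits.
	 */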
2191 
2192 	/* Put checksum into host byte order */
2193 	*csum = ntohs(*csum);
2194 
2195 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2196 	*csum += ~partial;
2197 	*csum += ((*csum) < ~partial);
2198 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2199 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2200 
2201 	/*
2202 	 * Restore checksum to network byte order;
2203 	 * later consumers expect this
2204 	 */
2205 	*csum = htons(*csum);
2206 
2207 	/* save the tag */
2208 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2209 	m->m_flags |= M_VLANTAG;
2210 
2211 	/*
2212 	 * Remove the 802.1q header by copying the Ethernet
2213 	 * addresses over it and adjusting the beginning of
2214 	 * the data in the mbuf.  The encapsulated Ethernet
2215 	 * type field is already in place.
2216 	 */
2217 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2218 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2219 	m_adj(m, EVL_ENCAPLEN);
2220 }
2221 
2222 
2223 static __inline void
2224 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2225     uint32_t len, uint32_t csum)
2226 {
2227 	struct mbuf *m;
2228 	const struct ether_header *eh;
2229 	bus_dmamap_t old_map;
2230 	int idx;
2231 
2232 	idx = rx->cnt & rx->mask;
2233 	rx->cnt++;
2234 
2235 	/* Save a pointer to the received mbuf */
2236 	m = rx->info[idx].m;
2237 
2238 	/* Try to replace the received mbuf */
2239 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2240 		/* Drop the frame -- the old mbuf is re-cycled */
2241 		IFNET_STAT_INC(ifp, ierrors, 1);
2242 		return;
2243 	}
2244 
2245 	/* Unmap the received buffer */
2246 	old_map = rx->info[idx].map;
2247 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2248 	bus_dmamap_unload(rx->dmat, old_map);
2249 
2250 	/* Swap the bus_dmamap_t's */
2251 	rx->info[idx].map = rx->extra_map;
2252 	rx->extra_map = old_map;
2253 
2254 	/*
2255 	 * The mcp implicitly skips the first 2 bytes so that the
2256 	 * packet is properly aligned.
2257 	 */
2258 	m->m_data += MXGEFW_PAD;
2259 
2260 	m->m_pkthdr.rcvif = ifp;
2261 	m->m_len = m->m_pkthdr.len = len;
2262 
2263 	IFNET_STAT_INC(ifp, ipackets, 1);
2264 
2265 	eh = mtod(m, const struct ether_header *);
2266 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2267 		mxge_vlan_tag_remove(m, &csum);
2268 
2269 	/* If the checksum is valid, mark it in the mbuf header */
2270 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2271 	    mxge_rx_csum(m, csum) == 0) {
2272 		/* Tell the stack that the checksum is good */
2273 		m->m_pkthdr.csum_data = 0xffff;
2274 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2275 		    CSUM_DATA_VALID;
2276 	}
2277 	ifp->if_input(ifp, m, NULL, -1);
2278 }
2279 
2280 static __inline void
2281 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2282     uint32_t len, uint32_t csum)
2283 {
2284 	const struct ether_header *eh;
2285 	struct mbuf *m;
2286 	bus_dmamap_t old_map;
2287 	int idx;
2288 
2289 	idx = rx->cnt & rx->mask;
2290 	rx->cnt++;
2291 
2292 	/* Save a pointer to the received mbuf */
2293 	m = rx->info[idx].m;
2294 
2295 	/* Try to replace the received mbuf */
2296 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2297 		/* Drop the frame -- the old mbuf is re-cycled */
2298 		IFNET_STAT_INC(ifp, ierrors, 1);
2299 		return;
2300 	}
2301 
2302 	/* Unmap the received buffer */
2303 	old_map = rx->info[idx].map;
2304 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2305 	bus_dmamap_unload(rx->dmat, old_map);
2306 
2307 	/* Swap the bus_dmamap_t's */
2308 	rx->info[idx].map = rx->extra_map;
2309 	rx->extra_map = old_map;
2310 
2311 	/*
2312 	 * The mcp implicitly skips the first 2 bytes so that the
2313 	 * packet is properly aligned.
2314 	 */
2315 	m->m_data += MXGEFW_PAD;
2316 
2317 	m->m_pkthdr.rcvif = ifp;
2318 	m->m_len = m->m_pkthdr.len = len;
2319 
2320 	IFNET_STAT_INC(ifp, ipackets, 1);
2321 
2322 	eh = mtod(m, const struct ether_header *);
2323 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2324 		mxge_vlan_tag_remove(m, &csum);
2325 
2326 	/* If the checksum is valid, mark it in the mbuf header */
2327 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2328 	    mxge_rx_csum(m, csum) == 0) {
2329 		/* Tell the stack that the checksum is good */
2330 		m->m_pkthdr.csum_data = 0xffff;
2331 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2332 		    CSUM_DATA_VALID;
2333 	}
2334 	ifp->if_input(ifp, m, NULL, -1);
2335 }
2336 
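/*
 * Drain up to "cycle" completed receives; callers pass -1 when the
 * ring should be drained completely (the counter is only compared
 * against 0, so a negative value effectively means no limit).
 */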
2337 static __inline void
2338 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2339 {
2340 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2341 
2342 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2343 		uint16_t length, checksum;
2344 
2345 		length = ntohs(rx_done->entry[rx_done->idx].length);
2346 		rx_done->entry[rx_done->idx].length = 0;
2347 
2348 		checksum = rx_done->entry[rx_done->idx].checksum;
2349 
2350 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2351 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2352 			    length, checksum);
2353 		} else {
2354 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2355 			    length, checksum);
2356 		}
2357 
2358 		rx_done->idx++;
2359 		rx_done->idx &= rx_done->mask;
2360 		--cycle;
2361 	}
2362 }
2363 
2364 static __inline void
2365 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2366 {
2367 	ASSERT_SERIALIZED(&tx->tx_serialize);
2368 
2369 	while (tx->pkt_done != mcp_idx) {
2370 		struct mbuf *m;
2371 		int idx;
2372 
2373 		idx = tx->done & tx->mask;
2374 		tx->done++;
2375 
2376 		m = tx->info[idx].m;
2377 		/*
2378 		 * The mbuf and its DMA map are attached only to the
2379 		 * last descriptor of each frame.
2380 		 */
2381 		if (m != NULL) {
2382 			tx->pkt_done++;
2383 			IFNET_STAT_INC(ifp, opackets, 1);
2384 			tx->info[idx].m = NULL;
2385 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2386 			m_freem(m);
2387 		}
2388 	}
2389 
2390 	/*
2391 	 * If we have space, clear OACTIVE to tell the stack that
2392 	 * it's OK to send packets
2393 	 */
2394 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2395 		ifsq_clr_oactive(tx->ifsq);
2396 		if (tx->req == tx->done) {
2397 			/* Reset watchdog */
2398 			tx->watchdog.wd_timer = 0;
2399 		}
2400 	}
2401 
2402 	if (!ifsq_is_empty(tx->ifsq))
2403 		ifsq_devstart(tx->ifsq);
2404 
2405 	if (tx->send_stop != NULL && tx->req == tx->done) {
2406 		/*
2407 		 * Let the NIC stop polling this queue, since there
2408 		 * are no more transmits pending
2409 		 */
2410 		*tx->send_stop = 1;
2411 		tx->queue_active = 0;
2412 		tx->deactivate++;
2413 		wmb();
2414 	}
2415 }
2416 
2417 static struct mxge_media_type mxge_xfp_media_types[] = {
2418 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2419 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2420 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2421 	{IFM_NONE,	(1 << 5),	"10GBASE-ER"},
2422 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2423 	{IFM_NONE,	(1 << 3),	"10GBASE-SW"},
2424 	{IFM_NONE,	(1 << 2),	"10GBASE-LW"},
2425 	{IFM_NONE,	(1 << 1),	"10GBASE-EW"},
2426 	{IFM_NONE,	(1 << 0),	"Reserved"}
2427 };
2428 
2429 static struct mxge_media_type mxge_sfp_media_types[] = {
2430 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2431 	{IFM_NONE,	(1 << 7),	"Reserved"},
2432 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2433 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2434 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2435 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2436 };
2437 
2438 static void
2439 mxge_media_set(mxge_softc_t *sc, int media_type)
2440 {
2441 	int fc_opt = 0;
2442 
2443 	if (media_type == IFM_NONE)
2444 		return;
2445 
2446 	if (sc->pause)
2447 		fc_opt = IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
2448 
2449 	ifmedia_add(&sc->media, MXGE_IFM | media_type, 0, NULL);
2450 	ifmedia_set(&sc->media, MXGE_IFM | media_type | fc_opt);
2451 
2452 	sc->current_media = media_type;
2453 }
2454 
2455 static void
2456 mxge_media_unset(mxge_softc_t *sc)
2457 {
2458 	ifmedia_removeall(&sc->media);
2459 	sc->current_media = IFM_NONE;
2460 }
2461 
2462 static void
2463 mxge_media_init(mxge_softc_t *sc)
2464 {
2465 	const char *ptr;
2466 	int i;
2467 
2468 	mxge_media_unset(sc);
2469 
2470 	/*
2471 	 * Parse the product code to determine the interface type
2472 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2473 	 * after the 3rd dash in the driver's cached copy of the
2474 	 * EEPROM's product code string.
2475 	 */
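	/*
	 * For example, a (hypothetical) product code of "10G-PCIE-8A-R"
	 * leaves ptr at 'R' after skipping three dashes, which is
	 * decoded below as an XFP cage.
	 */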
2476 	ptr = sc->product_code_string;
2477 	if (ptr == NULL) {
2478 		if_printf(sc->ifp, "Missing product code\n");
2479 		return;
2480 	}
2481 
2482 	for (i = 0; i < 3; i++, ptr++) {
2483 		ptr = strchr(ptr, '-');
2484 		if (ptr == NULL) {
2485 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2486 			return;
2487 		}
2488 	}
2489 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2490 		/* -C is CX4 */
2491 		sc->connector = MXGE_CX4;
2492 		mxge_media_set(sc, IFM_10G_CX4);
2493 	} else if (*ptr == 'Q') {
2494 		/* -Q is Quad Ribbon Fiber */
2495 		sc->connector = MXGE_QRF;
2496 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2497 		/* DragonFly has no media type for Quad ribbon fiber */
2498 	} else if (*ptr == 'R') {
2499 		/* -R is XFP */
2500 		sc->connector = MXGE_XFP;
2501 		/* NOTE: ifmedia will be installed later */
2502 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2503 		/* -S or -2S is SFP+ */
2504 		sc->connector = MXGE_SFP;
2505 		/* NOTE: ifmedia will be installed later */
2506 	} else {
2507 		sc->connector = MXGE_UNK;
2508 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2509 	}
2510 }
2511 
2512 /*
2513  * Determine the media type for a NIC.  Some XFPs will identify
2514  * themselves only when their link is up, so this is initiated via a
2515  * link up interrupt.  However, this can potentially take up to
2516  * several milliseconds, so it is run via the watchdog routine, rather
2517  * than in the interrupt handler itself.
2518  */
2519 static void
2520 mxge_media_probe(mxge_softc_t *sc)
2521 {
2522 	mxge_cmd_t cmd;
2523 	const char *cage_type;
2524 	struct mxge_media_type *mxge_media_types = NULL;
2525 	int i, err, ms, mxge_media_type_entries;
2526 	uint32_t byte;
2527 
2528 	sc->need_media_probe = 0;
2529 
2530 	if (sc->connector == MXGE_XFP) {
2531 		/* -R is XFP */
2532 		mxge_media_types = mxge_xfp_media_types;
2533 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2534 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2535 		cage_type = "XFP";
2536 	} else if (sc->connector == MXGE_SFP) {
2537 		/* -S or -2S is SFP+ */
2538 		mxge_media_types = mxge_sfp_media_types;
2539 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2540 		cage_type = "SFP+";
2541 		byte = 3;
2542 	} else {
2543 		/* nothing to do; media type cannot change */
2544 		return;
2545 	}
2546 
2547 	/*
2548 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2549 	 * we try to determine what is in the cage by using the
2550 	 * firmware's I2C commands to read the module's 10GbE compliance
2551 	 * register.  We read just one byte, which may take over
2552 	 * a millisecond.
2553 	 */
2554 
2555 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
2556 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2557 	cmd.data1 = byte;
2558 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2559 	if (err != MXGEFW_CMD_OK) {
2560 		if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2561 			if_printf(sc->ifp, "failed to read XFP\n");
2562 		else if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2563 			if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2564 		else
2565 			if_printf(sc->ifp, "I2C read failed, err: %d\n", err);
2566 		mxge_media_unset(sc);
2567 		return;
2568 	}
2569 
2570 	/* Now we wait for the data to be cached */
2571 	cmd.data0 = byte;
2572 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2573 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2574 		DELAY(1000);
2575 		cmd.data0 = byte;
2576 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2577 	}
2578 	if (err != MXGEFW_CMD_OK) {
2579 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2580 		    cage_type, err, ms);
2581 		mxge_media_unset(sc);
2582 		return;
2583 	}
2584 
2585 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2586 		if (bootverbose) {
2587 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2588 			    mxge_media_types[0].name);
2589 		}
2590 		if (sc->current_media != mxge_media_types[0].flag) {
2591 			mxge_media_unset(sc);
2592 			mxge_media_set(sc, mxge_media_types[0].flag);
2593 		}
2594 		return;
2595 	}
2596 	for (i = 1; i < mxge_media_type_entries; i++) {
2597 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2598 			if (bootverbose) {
2599 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2600 				    mxge_media_types[i].name);
2601 			}
2602 
2603 			if (sc->current_media != mxge_media_types[i].flag) {
2604 				mxge_media_unset(sc);
2605 				mxge_media_set(sc, mxge_media_types[i].flag);
2606 			}
2607 			return;
2608 		}
2609 	}
2610 	mxge_media_unset(sc);
2611 	if (bootverbose) {
2612 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2613 		    cmd.data0);
2614 	}
2615 }
2616 
2617 static void
2618 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2619 {
2620 	if (sc->link_state != stats->link_up) {
2621 		sc->link_state = stats->link_up;
2622 		if (sc->link_state) {
2623 			sc->ifp->if_link_state = LINK_STATE_UP;
2624 			if_link_state_change(sc->ifp);
2625 			if (bootverbose)
2626 				if_printf(sc->ifp, "link up\n");
2627 		} else {
2628 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2629 			if_link_state_change(sc->ifp);
2630 			if (bootverbose)
2631 				if_printf(sc->ifp, "link down\n");
2632 		}
2633 		sc->need_media_probe = 1;
2634 	}
2635 
2636 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2637 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2638 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2639 		    sc->rdma_tags_available);
2640 	}
2641 
2642 	if (stats->link_down) {
2643 		sc->down_cnt += stats->link_down;
2644 		sc->link_state = 0;
2645 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2646 		if_link_state_change(sc->ifp);
2647 	}
2648 }
2649 
2650 static void
2651 mxge_serialize_skipmain(struct mxge_softc *sc)
2652 {
2653 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2654 }
2655 
2656 static void
2657 mxge_deserialize_skipmain(struct mxge_softc *sc)
2658 {
2659 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2660 }
2661 
2662 static void
2663 mxge_legacy(void *arg)
2664 {
2665 	struct mxge_slice_state *ss = arg;
2666 	mxge_softc_t *sc = ss->sc;
2667 	mcp_irq_data_t *stats = ss->fw_stats;
2668 	mxge_tx_ring_t *tx = &ss->tx;
2669 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2670 	uint32_t send_done_count;
2671 	uint8_t valid;
2672 
2673 	ASSERT_SERIALIZED(&sc->main_serialize);
2674 
2675 	/* Make sure the DMA has finished */
2676 	if (!stats->valid)
2677 		return;
2678 	valid = stats->valid;
2679 
2680 	/* Lower legacy IRQ */
2681 	*sc->irq_deassert = 0;
2682 	if (!mxge_deassert_wait) {
2683 		/* Don't wait for confirmation that the irq is low */
2684 		stats->valid = 0;
2685 	}
2686 
2687 	mxge_serialize_skipmain(sc);
2688 
2689 	/*
2690 	 * Loop while waiting for legacy irq deassertion
2691 	 * XXX do we really want to loop?
2692 	 */
2693 	do {
2694 		/* Check for transmit completes and receives */
2695 		send_done_count = be32toh(stats->send_done_count);
2696 		while ((send_done_count != tx->pkt_done) ||
2697 		       (rx_done->entry[rx_done->idx].length != 0)) {
2698 			if (send_done_count != tx->pkt_done) {
2699 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2700 				    (int)send_done_count);
2701 			}
2702 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2703 			send_done_count = be32toh(stats->send_done_count);
2704 		}
2705 		if (mxge_deassert_wait)
2706 			wmb();
2707 	} while (*((volatile uint8_t *)&stats->valid));
2708 
2709 	mxge_deserialize_skipmain(sc);
2710 
2711 	/* Fw link & error stats are meaningful only on the first slice */
2712 	if (__predict_false(stats->stats_updated))
2713 		mxge_intr_status(sc, stats);
2714 
2715 	/* Check to see if we have rx token to pass back */
2716 	if (valid & 0x1)
2717 		*ss->irq_claim = be32toh(3);
2718 	*(ss->irq_claim + 1) = be32toh(3);
2719 }
2720 
2721 static void
2722 mxge_msi(void *arg)
2723 {
2724 	struct mxge_slice_state *ss = arg;
2725 	mxge_softc_t *sc = ss->sc;
2726 	mcp_irq_data_t *stats = ss->fw_stats;
2727 	mxge_tx_ring_t *tx = &ss->tx;
2728 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2729 	uint32_t send_done_count;
2730 	uint8_t valid;
2731 #ifndef IFPOLL_ENABLE
2732 	const boolean_t polling = FALSE;
2733 #else
2734 	boolean_t polling = FALSE;
2735 #endif
2736 
2737 	ASSERT_SERIALIZED(&sc->main_serialize);
2738 
2739 	/* Make sure the DMA has finished */
2740 	if (__predict_false(!stats->valid))
2741 		return;
2742 
2743 	valid = stats->valid;
2744 	stats->valid = 0;
2745 
2746 #ifdef IFPOLL_ENABLE
2747 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2748 		polling = TRUE;
2749 #endif
2750 
2751 	if (!polling) {
2752 		/* Check for receives */
2753 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2754 		if (rx_done->entry[rx_done->idx].length != 0)
2755 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2756 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2757 	}
2758 
2759 	/*
2760 	 * Check for transmit completes
2761 	 *
2762 	 * NOTE:
2763 	 * Since pkt_done is only changed by mxge_tx_done(),
2764 	 * which is called only in interrupt handler, the
2765 	 * check w/o holding tx serializer is MPSAFE.
2766 	 */
2767 	send_done_count = be32toh(stats->send_done_count);
2768 	if (send_done_count != tx->pkt_done) {
2769 		lwkt_serialize_enter(&tx->tx_serialize);
2770 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2771 		lwkt_serialize_exit(&tx->tx_serialize);
2772 	}
2773 
2774 	if (__predict_false(stats->stats_updated))
2775 		mxge_intr_status(sc, stats);
2776 
2777 	/* Check to see if we have rx token to pass back */
2778 	if (!polling && (valid & 0x1))
2779 		*ss->irq_claim = be32toh(3);
2780 	*(ss->irq_claim + 1) = be32toh(3);
2781 }
2782 
2783 static void
2784 mxge_msix_rx(void *arg)
2785 {
2786 	struct mxge_slice_state *ss = arg;
2787 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2788 
2789 #ifdef IFPOLL_ENABLE
2790 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2791 		return;
2792 #endif
2793 
2794 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2795 
2796 	if (rx_done->entry[rx_done->idx].length != 0)
2797 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2798 
2799 	*ss->irq_claim = be32toh(3);
2800 }
2801 
2802 static void
2803 mxge_msix_rxtx(void *arg)
2804 {
2805 	struct mxge_slice_state *ss = arg;
2806 	mxge_softc_t *sc = ss->sc;
2807 	mcp_irq_data_t *stats = ss->fw_stats;
2808 	mxge_tx_ring_t *tx = &ss->tx;
2809 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2810 	uint32_t send_done_count;
2811 	uint8_t valid;
2812 #ifndef IFPOLL_ENABLE
2813 	const boolean_t polling = FALSE;
2814 #else
2815 	boolean_t polling = FALSE;
2816 #endif
2817 
2818 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2819 
2820 	/* Make sure the DMA has finished */
2821 	if (__predict_false(!stats->valid))
2822 		return;
2823 
2824 	valid = stats->valid;
2825 	stats->valid = 0;
2826 
2827 #ifdef IFPOLL_ENABLE
2828 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2829 		polling = TRUE;
2830 #endif
2831 
2832 	/* Check for receives */
2833 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2834 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2835 
2836 	/*
2837 	 * Check for transmit completes
2838 	 *
2839 	 * NOTE:
2840 	 * Since pkt_done is only changed by mxge_tx_done(),
2841 	 * which is called only in interrupt handler, the
2842 	 * check w/o holding tx serializer is MPSAFE.
2843 	 */
2844 	send_done_count = be32toh(stats->send_done_count);
2845 	if (send_done_count != tx->pkt_done) {
2846 		lwkt_serialize_enter(&tx->tx_serialize);
2847 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2848 		lwkt_serialize_exit(&tx->tx_serialize);
2849 	}
2850 
2851 	/* Check to see if we have rx token to pass back */
2852 	if (!polling && (valid & 0x1))
2853 		*ss->irq_claim = be32toh(3);
2854 	*(ss->irq_claim + 1) = be32toh(3);
2855 }
2856 
2857 static void
2858 mxge_init(void *arg)
2859 {
2860 	struct mxge_softc *sc = arg;
2861 
2862 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2863 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2864 		mxge_open(sc);
2865 }
2866 
2867 static void
2868 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2869 {
2870 	int i;
2871 
2872 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2873 		if (ss->rx_data.rx_big.info[i].m == NULL)
2874 			continue;
2875 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2876 		    ss->rx_data.rx_big.info[i].map);
2877 		m_freem(ss->rx_data.rx_big.info[i].m);
2878 		ss->rx_data.rx_big.info[i].m = NULL;
2879 	}
2880 
2881 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2882 		if (ss->rx_data.rx_small.info[i].m == NULL)
2883 			continue;
2884 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2885 		    ss->rx_data.rx_small.info[i].map);
2886 		m_freem(ss->rx_data.rx_small.info[i].m);
2887 		ss->rx_data.rx_small.info[i].m = NULL;
2888 	}
2889 
2890 	/* Transmit ring used only on the first slice */
2891 	if (ss->tx.info == NULL)
2892 		return;
2893 
2894 	for (i = 0; i <= ss->tx.mask; i++) {
2895 		if (ss->tx.info[i].m == NULL)
2896 			continue;
2897 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2898 		m_freem(ss->tx.info[i].m);
2899 		ss->tx.info[i].m = NULL;
2900 	}
2901 }
2902 
2903 static void
2904 mxge_free_mbufs(mxge_softc_t *sc)
2905 {
2906 	int slice;
2907 
2908 	for (slice = 0; slice < sc->num_slices; slice++)
2909 		mxge_free_slice_mbufs(&sc->ss[slice]);
2910 }
2911 
2912 static void
2913 mxge_free_slice_rings(struct mxge_slice_state *ss)
2914 {
2915 	int i;
2916 
2917 	if (ss->rx_data.rx_done.entry != NULL) {
2918 		mxge_dma_free(&ss->rx_done_dma);
2919 		ss->rx_data.rx_done.entry = NULL;
2920 	}
2921 
2922 	if (ss->tx.req_list != NULL) {
2923 		kfree(ss->tx.req_list, M_DEVBUF);
2924 		ss->tx.req_list = NULL;
2925 	}
2926 
2927 	if (ss->tx.seg_list != NULL) {
2928 		kfree(ss->tx.seg_list, M_DEVBUF);
2929 		ss->tx.seg_list = NULL;
2930 	}
2931 
2932 	if (ss->rx_data.rx_small.shadow != NULL) {
2933 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2934 		ss->rx_data.rx_small.shadow = NULL;
2935 	}
2936 
2937 	if (ss->rx_data.rx_big.shadow != NULL) {
2938 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2939 		ss->rx_data.rx_big.shadow = NULL;
2940 	}
2941 
2942 	if (ss->tx.info != NULL) {
2943 		if (ss->tx.dmat != NULL) {
2944 			for (i = 0; i <= ss->tx.mask; i++) {
2945 				bus_dmamap_destroy(ss->tx.dmat,
2946 				    ss->tx.info[i].map);
2947 			}
2948 			bus_dma_tag_destroy(ss->tx.dmat);
2949 		}
2950 		kfree(ss->tx.info, M_DEVBUF);
2951 		ss->tx.info = NULL;
2952 	}
2953 
2954 	if (ss->rx_data.rx_small.info != NULL) {
2955 		if (ss->rx_data.rx_small.dmat != NULL) {
2956 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2957 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2958 				    ss->rx_data.rx_small.info[i].map);
2959 			}
2960 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2961 			    ss->rx_data.rx_small.extra_map);
2962 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2963 		}
2964 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2965 		ss->rx_data.rx_small.info = NULL;
2966 	}
2967 
2968 	if (ss->rx_data.rx_big.info != NULL) {
2969 		if (ss->rx_data.rx_big.dmat != NULL) {
2970 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2971 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2972 				    ss->rx_data.rx_big.info[i].map);
2973 			}
2974 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2975 			    ss->rx_data.rx_big.extra_map);
2976 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2977 		}
2978 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2979 		ss->rx_data.rx_big.info = NULL;
2980 	}
2981 }
2982 
2983 static void
2984 mxge_free_rings(mxge_softc_t *sc)
2985 {
2986 	int slice;
2987 
2988 	if (sc->ss == NULL)
2989 		return;
2990 
2991 	for (slice = 0; slice < sc->num_slices; slice++)
2992 		mxge_free_slice_rings(&sc->ss[slice]);
2993 }
2994 
2995 static int
2996 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2997     int tx_ring_entries)
2998 {
2999 	mxge_softc_t *sc = ss->sc;
3000 	size_t bytes;
3001 	int err, i;
3002 
3003 	/*
3004 	 * Allocate per-slice receive resources
3005 	 */
3006 
3007 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
3008 	    rx_ring_entries - 1;
3009 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
3010 
3011 	/* Allocate the rx shadow rings */
3012 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3013 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 
3015 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3016 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3017 
3018 	/* Allocate the rx host info rings */
3019 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3020 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3021 
3022 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3023 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3024 
3025 	/* Allocate the rx busdma resources */
3026 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3027 				 1,			/* alignment */
3028 				 4096,			/* boundary */
3029 				 BUS_SPACE_MAXADDR,	/* low */
3030 				 BUS_SPACE_MAXADDR,	/* high */
3031 				 NULL, NULL,		/* filter */
3032 				 MHLEN,			/* maxsize */
3033 				 1,			/* num segs */
3034 				 MHLEN,			/* maxsegsize */
3035 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3036 				 			/* flags */
3037 				 &ss->rx_data.rx_small.dmat); /* tag */
3038 	if (err != 0) {
3039 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3040 		    err);
3041 		return err;
3042 	}
3043 
3044 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3045 	    &ss->rx_data.rx_small.extra_map);
3046 	if (err != 0) {
3047 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3048 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3049 		ss->rx_data.rx_small.dmat = NULL;
3050 		return err;
3051 	}
3052 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3053 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3054 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3055 		if (err != 0) {
3056 			int j;
3057 
3058 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3059 
3060 			for (j = 0; j < i; ++j) {
3061 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3062 				    ss->rx_data.rx_small.info[j].map);
3063 			}
3064 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3065 			    ss->rx_data.rx_small.extra_map);
3066 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3067 			ss->rx_data.rx_small.dmat = NULL;
3068 			return err;
3069 		}
3070 	}
3071 
3072 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3073 				 1,			/* alignment */
3074 				 4096,			/* boundary */
3075 				 BUS_SPACE_MAXADDR,	/* low */
3076 				 BUS_SPACE_MAXADDR,	/* high */
3077 				 NULL, NULL,		/* filter */
3078 				 4096,			/* maxsize */
3079 				 1,			/* num segs */
3080 				 4096,			/* maxsegsize*/
3081 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3082 				 			/* flags */
3083 				 &ss->rx_data.rx_big.dmat); /* tag */
3084 	if (err != 0) {
3085 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3086 		    err);
3087 		return err;
3088 	}
3089 
3090 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3091 	    &ss->rx_data.rx_big.extra_map);
3092 	if (err != 0) {
3093 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3094 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3095 		ss->rx_data.rx_big.dmat = NULL;
3096 		return err;
3097 	}
3098 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3099 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3100 		    &ss->rx_data.rx_big.info[i].map);
3101 		if (err != 0) {
3102 			int j;
3103 
3104 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3105 			for (j = 0; j < i; ++j) {
3106 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3107 				    ss->rx_data.rx_big.info[j].map);
3108 			}
3109 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3110 			    ss->rx_data.rx_big.extra_map);
3111 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3112 			ss->rx_data.rx_big.dmat = NULL;
3113 			return err;
3114 		}
3115 	}
3116 
3117 	/*
3118 	 * Now allocate TX resources
3119 	 */
3120 
3121 	ss->tx.mask = tx_ring_entries - 1;
3122 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3123 
3124 	/*
3125 	 * Allocate the tx request copy block; MUST be at least 8 bytes
3126 	 * aligned
3127 	 */
3128 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3129 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3130 	    M_DEVBUF, M_WAITOK);
3131 
3132 	/* Allocate the tx busdma segment list */
3133 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3134 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3135 
3136 	/* Allocate the tx host info ring */
3137 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3138 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 
3140 	/* Allocate the tx busdma resources */
3141 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3142 				 1,			/* alignment */
3143 				 sc->tx_boundary,	/* boundary */
3144 				 BUS_SPACE_MAXADDR,	/* low */
3145 				 BUS_SPACE_MAXADDR,	/* high */
3146 				 NULL, NULL,		/* filter */
3147 				 IP_MAXPACKET +
3148 				 sizeof(struct ether_vlan_header),
3149 				 			/* maxsize */
3150 				 ss->tx.max_desc - 2,	/* num segs */
3151 				 sc->tx_boundary,	/* maxsegsz */
3152 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3153 				 BUS_DMA_ONEBPAGE,	/* flags */
3154 				 &ss->tx.dmat);		/* tag */
3155 	if (err != 0) {
3156 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3157 		return err;
3158 	}
3159 
3160 	/*
3161 	 * Now use these tags to setup DMA maps for each slot in the ring
3162 	 */
3163 	for (i = 0; i <= ss->tx.mask; i++) {
3164 		err = bus_dmamap_create(ss->tx.dmat,
3165 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3166 		if (err != 0) {
3167 			int j;
3168 
3169 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3170 			for (j = 0; j < i; ++j) {
3171 				bus_dmamap_destroy(ss->tx.dmat,
3172 				    ss->tx.info[j].map);
3173 			}
3174 			bus_dma_tag_destroy(ss->tx.dmat);
3175 			ss->tx.dmat = NULL;
3176 			return err;
3177 		}
3178 	}
3179 	return 0;
3180 }
3181 
3182 static int
3183 mxge_alloc_rings(mxge_softc_t *sc)
3184 {
3185 	mxge_cmd_t cmd;
3186 	int tx_ring_size;
3187 	int tx_ring_entries, rx_ring_entries;
3188 	int err, slice;
3189 
3190 	/* Get ring sizes */
3191 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3192 	if (err != 0) {
3193 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3194 		return err;
3195 	}
3196 	tx_ring_size = cmd.data0;
3197 
3198 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3199 	rx_ring_entries = sc->rx_intr_slots / 2;
3200 
3201 	if (bootverbose) {
3202 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3203 		    tx_ring_entries, rx_ring_entries);
3204 	}
3205 
3206 	sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3207 	sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3208 
3209 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3210 	ifq_set_ready(&sc->ifp->if_snd);
3211 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3212 
3213 	if (sc->num_tx_rings > 1) {
3214 		sc->ifp->if_mapsubq = ifq_mapsubq_modulo;
3215 		ifq_set_subq_divisor(&sc->ifp->if_snd, sc->num_tx_rings);
3216 	}
3217 
3218 	for (slice = 0; slice < sc->num_slices; slice++) {
3219 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3220 		    rx_ring_entries, tx_ring_entries);
3221 		if (err != 0) {
3222 			device_printf(sc->dev,
3223 			    "alloc %d slice rings failed\n", slice);
3224 			return err;
3225 		}
3226 	}
3227 	return 0;
3228 }
3229 
3230 static void
3231 mxge_choose_params(int mtu, int *cl_size)
3232 {
3233 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
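	/*
	 * e.g. the standard 1500-byte MTU gives a bufsize of 1520
	 * (1500 + 14 + 4 + 2), which fits a standard 2KB cluster;
	 * larger MTUs fall through to page-sized jumbo clusters.
	 */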
3234 
3235 	if (bufsize < MCLBYTES) {
3236 		*cl_size = MCLBYTES;
3237 	} else {
3238 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3239 		*cl_size = MJUMPAGESIZE;
3240 	}
3241 }
3242 
3243 static int
3244 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3245 {
3246 	mxge_cmd_t cmd;
3247 	int err, i, slice;
3248 
3249 	slice = ss - ss->sc->ss;
3250 
3251 	/*
3252 	 * Get the lanai pointers to the send and receive rings
3253 	 */
3254 	err = 0;
3255 
3256 	bzero(&cmd, sizeof(cmd));	/* silence gcc warning */
3257 	if (ss->sc->num_tx_rings == 1) {
3258 		if (slice == 0) {
3259 			cmd.data0 = slice;
3260 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3261 			    &cmd);
3262 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3263 			    (ss->sc->sram + cmd.data0);
3264 			/* Leave send_go and send_stop as NULL */
3265 		}
3266 	} else {
3267 		cmd.data0 = slice;
3268 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3269 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3270 		    (ss->sc->sram + cmd.data0);
3271 		ss->tx.send_go = (volatile uint32_t *)
3272 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3273 		ss->tx.send_stop = (volatile uint32_t *)
3274 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3275 	}
3276 
3277 	cmd.data0 = slice;
3278 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3279 	ss->rx_data.rx_small.lanai =
3280 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3281 
3282 	cmd.data0 = slice;
3283 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3284 	ss->rx_data.rx_big.lanai =
3285 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3286 
3287 	if (err != 0) {
3288 		if_printf(ss->sc->ifp,
3289 		    "failed to get ring sizes or locations\n");
3290 		return EIO;
3291 	}
3292 
3293 	/*
3294 	 * Stock small receive ring
3295 	 */
3296 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3297 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3298 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3299 		if (err) {
3300 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3301 			    ss->rx_data.rx_small.mask + 1);
3302 			return ENOMEM;
3303 		}
3304 	}
3305 
3306 	/*
3307 	 * Stock big receive ring
3308 	 */
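	/*
	 * Pre-poison every shadow entry with the invalid address
	 * 0xffffffff (the sentinel mxge_submit_8rx() also uses), so the
	 * NIC ignores slots until real buffers are posted below.
	 */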
3309 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3310 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3311 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3312 	}
3313 
3314 	ss->rx_data.rx_big.cl_size = cl_size;
3315 
3316 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3317 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3318 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3319 		if (err) {
3320 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3321 			    ss->rx_data.rx_big.mask + 1);
3322 			return ENOMEM;
3323 		}
3324 	}
3325 	return 0;
3326 }
3327 
3328 static int
3329 mxge_open(mxge_softc_t *sc)
3330 {
3331 	struct ifnet *ifp = sc->ifp;
3332 	mxge_cmd_t cmd;
3333 	int err, slice, cl_size, i;
3334 	bus_addr_t bus;
3335 	volatile uint8_t *itable;
3336 	struct mxge_slice_state *ss;
3337 
3338 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3339 
3340 	/* Copy the MAC address in case it was overridden */
3341 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3342 
3343 	err = mxge_reset(sc, 1);
3344 	if (err != 0) {
3345 		if_printf(ifp, "failed to reset\n");
3346 		return EIO;
3347 	}
3348 
3349 	if (sc->num_slices > 1) {
3350 		/*
3351 		 * Setup the indirect table.
3352 		 */
3353 		if_ringmap_rdrtable(sc->ring_map, sc->rdr_table, NETISR_CPUMAX);
3354 
3355 		cmd.data0 = NETISR_CPUMAX;
3356 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3357 
3358 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3359 		if (err != 0) {
3360 			if_printf(ifp, "failed to setup rss tables\n");
3361 			return err;
3362 		}
3363 
3364 		itable = sc->sram + cmd.data0;
3365 		for (i = 0; i < NETISR_CPUMAX; i++)
3366 			itable[i] = sc->rdr_table[i];
3367 
3368 		if (sc->use_rss) {
3369 			volatile uint8_t *hwkey;
3370 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3371 
3372 			/*
3373 			 * Setup Toeplitz key.
3374 			 */
3375 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3376 			    &cmd);
3377 			if (err != 0) {
3378 				if_printf(ifp, "failed to get rsskey\n");
3379 				return err;
3380 			}
3381 			hwkey = sc->sram + cmd.data0;
3382 
3383 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3384 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3385 				hwkey[i] = swkey[i];
3386 			wmb();
3387 
3388 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3389 			    &cmd);
3390 			if (err != 0) {
3391 				if_printf(ifp, "failed to update rsskey\n");
3392 				return err;
3393 			}
3394 			if (bootverbose)
3395 				if_printf(ifp, "RSS key updated\n");
3396 		}
3397 
3398 		cmd.data0 = 1;
3399 		if (sc->use_rss) {
3400 			if (bootverbose)
3401 				if_printf(ifp, "input hash: RSS\n");
3402 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3403 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3404 		} else {
3405 			if (bootverbose)
3406 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3407 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3408 		}
3409 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3410 		if (err != 0) {
3411 			if_printf(ifp, "failed to enable slices\n");
3412 			return err;
3413 		}
3414 	}
3415 
3416 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3417 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3418 	if (err) {
3419 		/*
3420 		 * If we can't change the TSO mode to NDIS, never allow TSO.
3421 		 */
3422 		if_printf(ifp, "failed to set TSO mode\n");
3423 		ifp->if_capenable &= ~IFCAP_TSO;
3424 		ifp->if_capabilities &= ~IFCAP_TSO;
3425 		ifp->if_hwassist &= ~CSUM_TSO;
3426 	}
3427 
3428 	mxge_choose_params(ifp->if_mtu, &cl_size);
3429 
3430 	cmd.data0 = 1;
3431 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3432 	/*
3433 	 * Error is only meaningful if we're trying to set
3434 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3435 	 */
3436 
3437 	/*
3438 	 * Give the firmware the mtu and the big and small buffer
3439 	 * sizes.  The firmware wants the big buf size to be a power
3440 	 * of two. Luckily, DragonFly's clusters are powers of two
3441 	 */
3442 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3443 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3444 
3445 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3446 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3447 
3448 	cmd.data0 = cl_size;
3449 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3450 
3451 	if (err != 0) {
3452 		if_printf(ifp, "failed to setup params\n");
3453 		goto abort;
3454 	}
3455 
3456 	/* Now give the firmware the pointer to the stats block */
3457 	for (slice = 0; slice < sc->num_slices; slice++) {
3458 		ss = &sc->ss[slice];
3459 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3460 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3461 		cmd.data2 = sizeof(struct mcp_irq_data);
3462 		cmd.data2 |= (slice << 16);
3463 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3464 	}
3465 
3466 	if (err != 0) {
3467 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3468 		bus += offsetof(struct mcp_irq_data, send_done_count);
3469 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3470 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3471 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3472 		    &cmd);
3473 
3474 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3475 		sc->fw_multicast_support = 0;
3476 	} else {
3477 		sc->fw_multicast_support = 1;
3478 	}
3479 
3480 	if (err != 0) {
3481 		if_printf(ifp, "failed to setup params\n");
3482 		goto abort;
3483 	}
3484 
3485 	for (slice = 0; slice < sc->num_slices; slice++) {
3486 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3487 		if (err != 0) {
3488 			if_printf(ifp, "couldn't open slice %d\n", slice);
3489 			goto abort;
3490 		}
3491 	}
3492 
3493 	/* Finally, start the firmware running */
3494 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3495 	if (err) {
3496 		if_printf(ifp, "Couldn't bring up link\n");
3497 		goto abort;
3498 	}
3499 
3500 	ifp->if_flags |= IFF_RUNNING;
3501 	for (i = 0; i < sc->num_tx_rings; ++i) {
3502 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3503 
3504 		ifsq_clr_oactive(tx->ifsq);
3505 		ifsq_watchdog_start(&tx->watchdog);
3506 	}
3507 
3508 	return 0;
3509 
3510 abort:
3511 	mxge_free_mbufs(sc);
3512 	return err;
3513 }
3514 
3515 static void
3516 mxge_close(mxge_softc_t *sc, int down)
3517 {
3518 	struct ifnet *ifp = sc->ifp;
3519 	mxge_cmd_t cmd;
3520 	int err, old_down_cnt, i;
3521 
3522 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3523 
3524 	if (!down) {
3525 		old_down_cnt = sc->down_cnt;
3526 		wmb();
3527 
3528 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3529 		if (err)
3530 			if_printf(ifp, "Couldn't bring down link\n");
3531 
3532 		if (old_down_cnt == sc->down_cnt) {
3533 			/*
3534 			 * Wait for down irq
3535 			 * XXX racy
3536 			 */
3537 			ifnet_deserialize_all(ifp);
3538 			DELAY(10 * sc->intr_coal_delay);
3539 			ifnet_serialize_all(ifp);
3540 		}
3541 
3542 		wmb();
3543 		if (old_down_cnt == sc->down_cnt)
3544 			if_printf(ifp, "never got down irq\n");
3545 	}
3546 	mxge_free_mbufs(sc);
3547 
3548 	ifp->if_flags &= ~IFF_RUNNING;
3549 	for (i = 0; i < sc->num_tx_rings; ++i) {
3550 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3551 
3552 		ifsq_clr_oactive(tx->ifsq);
3553 		ifsq_watchdog_stop(&tx->watchdog);
3554 	}
3555 }
3556 
3557 static void
3558 mxge_setup_cfg_space(mxge_softc_t *sc)
3559 {
3560 	device_t dev = sc->dev;
3561 	int reg;
3562 	uint16_t lnk, pectl;
3563 
3564 	/* Find the PCIe link width and set max read request to 4KB */
3565 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
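		/* Link Status register: negotiated link width in bits 9:4 */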
3566 		lnk = pci_read_config(dev, reg + 0x12, 2);
3567 		sc->link_width = (lnk >> 4) & 0x3f;
3568 
3569 		if (sc->pectl == 0) {
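			/*
			 * Device Control register: set the max read
			 * request size field (bits 14:12) to 5 (4KB).
			 */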
3570 			pectl = pci_read_config(dev, reg + 0x8, 2);
3571 			pectl = (pectl & ~0x7000) | (5 << 12);
3572 			pci_write_config(dev, reg + 0x8, pectl, 2);
3573 			sc->pectl = pectl;
3574 		} else {
3575 			/* Restore saved pectl after watchdog reset */
3576 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3577 		}
3578 	}
3579 
3580 	/* Enable DMA and memory space access */
3581 	pci_enable_busmaster(dev);
3582 }
3583 
3584 static uint32_t
3585 mxge_read_reboot(mxge_softc_t *sc)
3586 {
3587 	device_t dev = sc->dev;
3588 	uint32_t vs;
3589 
3590 	/* Find the vendor specific offset */
3591 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3592 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3593 		return (uint32_t)-1;
3594 	}
3595 	/* Enable read32 mode */
3596 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3597 	/* Tell NIC which register to read */
3598 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3599 	return pci_read_config(dev, vs + 0x14, 4);
3600 }
3601 
3602 static void
3603 mxge_watchdog_reset(mxge_softc_t *sc)
3604 {
3605 	struct pci_devinfo *dinfo;
3606 	int err, running;
3607 	uint32_t reboot;
3608 	uint16_t cmd;
3609 
3610 	err = ENXIO;
3611 
3612 	if_printf(sc->ifp, "Watchdog reset!\n");
3613 
3614 	/*
3615 	 * Check to see if the NIC rebooted.  If it did, then all of
3616 	 * PCI config space has been reset, and things like the
3617 	 * busmaster bit will be zero.  If this is the case, then we
3618 	 * must restore PCI config space before the NIC can be used
3619 	 * again
3620 	 */
3621 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3622 	if (cmd == 0xffff) {
3623 		/*
3624 		 * Maybe the watchdog caught the NIC rebooting; wait
3625 		 * up to 100ms for it to finish.  If it does not come
3626 		 * back, then give up
3627 		 */
3628 		DELAY(1000*100);
3629 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3630 		if (cmd == 0xffff)
3631 			if_printf(sc->ifp, "NIC disappeared!\n");
3632 	}
3633 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3634 		/* Print the reboot status */
3635 		reboot = mxge_read_reboot(sc);
3636 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3637 
3638 		running = sc->ifp->if_flags & IFF_RUNNING;
3639 		if (running) {
3640 			/*
3641 			 * Quiesce NIC so that TX routines will not try to
3642 			 * xmit after restoration of BAR
3643 			 */
3644 
3645 			/* Mark the link as down */
3646 			if (sc->link_state) {
3647 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3648 				if_link_state_change(sc->ifp);
3649 			}
3650 			mxge_close(sc, 1);
3651 		}
3652 		/* Restore PCI configuration space */
3653 		dinfo = device_get_ivars(sc->dev);
3654 		pci_cfg_restore(sc->dev, dinfo);
3655 
3656 		/* And redo any changes we made to our config space */
3657 		mxge_setup_cfg_space(sc);
3658 
3659 		/* Reload f/w */
3660 		err = mxge_load_firmware(sc, 0);
3661 		if (err)
3662 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3663 		if (running && !err) {
3664 			int i;
3665 
3666 			err = mxge_open(sc);
3667 
3668 			for (i = 0; i < sc->num_tx_rings; ++i)
3669 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3670 		}
3671 		sc->watchdog_resets++;
3672 	} else {
3673 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3674 		err = 0;
3675 	}
3676 	if (err) {
3677 		if_printf(sc->ifp, "watchdog reset failed\n");
3678 	} else {
3679 		if (sc->dying == 2)
3680 			sc->dying = 0;
3681 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3682 	}
3683 }
3684 
3685 static void
3686 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3687 {
3688 	if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3689 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3690 	    tx->req, tx->done, tx->queue_active);
3691 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3692 	    tx->activate, tx->deactivate);
3693 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3694 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3695 }
3696 
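/*
 * Return the number of packets (input + output) moved since the
 * previous call; mxge_tick() uses this to detect an idle NIC.
 */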
3697 static u_long
3698 mxge_update_stats(mxge_softc_t *sc)
3699 {
3700 	u_long ipackets, opackets, pkts;
3701 
3702 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3703 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3704 
3705 	pkts = ipackets - sc->ipackets;
3706 	pkts += opackets - sc->opackets;
3707 
3708 	sc->ipackets = ipackets;
3709 	sc->opackets = opackets;
3710 
3711 	return pkts;
3712 }
3713 
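/*
 * Periodic housekeeping callout.  While the interface is running,
 * aggregate the packet counters and re-probe the media if requested.
 * If no packets moved during the last interval, check the busmaster
 * bit to catch a NIC that died while idle and stretch the polling
 * interval to 4 * mxge_ticks.
 */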
3714 static void
3715 mxge_tick(void *arg)
3716 {
3717 	mxge_softc_t *sc = arg;
3718 	u_long pkts = 0;
3719 	int err = 0;
3720 	int ticks;
3721 
3722 	lwkt_serialize_enter(&sc->main_serialize);
3723 
3724 	ticks = mxge_ticks;
3725 	if (sc->ifp->if_flags & IFF_RUNNING) {
3726 		/* Aggregate stats from different slices */
3727 		pkts = mxge_update_stats(sc);
3728 		if (sc->need_media_probe)
3729 			mxge_media_probe(sc);
3730 	}
3731 	if (pkts == 0) {
3732 		uint16_t cmd;
3733 
3734 		/* Ensure NIC did not suffer h/w fault while idle */
3735 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3736 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3737 			sc->dying = 2;
3738 			mxge_serialize_skipmain(sc);
3739 			mxge_watchdog_reset(sc);
3740 			mxge_deserialize_skipmain(sc);
3741 			err = ENXIO;
3742 		}
3743 
3744 		/* Look less often if NIC is idle */
3745 		ticks *= 4;
3746 	}
3747 
3748 	if (err == 0)
3749 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3750 
3751 	lwkt_serialize_exit(&sc->main_serialize);
3752 }
3753 
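/*
 * Media autoselect is not supported; the only ifmedia change honored
 * here is toggling flow-control pause (IFM_ETH_RXPAUSE/TXPAUSE).
 */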
3754 static int
3755 mxge_media_change(struct ifnet *ifp)
3756 {
3757 	mxge_softc_t *sc = ifp->if_softc;
3758 	const struct ifmedia *ifm = &sc->media;
3759 	int pause;
3760 
3761 	if (IFM_OPTIONS(ifm->ifm_media) & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
3762 		if (sc->pause)
3763 			return 0;
3764 		pause = 1;
3765 	} else {
3766 		if (!sc->pause)
3767 			return 0;
3768 		pause = 0;
3769 	}
3770 	return mxge_change_pause(sc, pause);
3771 }
3772 
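/*
 * Validate and apply a new MTU.  The MTU may not exceed the firmware
 * limit (sc->max_mtu), and the on-wire frame (MTU plus Ethernet
 * header and VLAN tag) must be at least the 60 byte minimum.  A
 * running interface is closed and reopened to resize the rings; if
 * that fails, the old MTU is restored the same way.
 */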
3773 static int
3774 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3775 {
3776 	struct ifnet *ifp = sc->ifp;
3777 	int real_mtu, old_mtu;
3778 	int err = 0;
3779 
3780 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3781 	if (mtu > sc->max_mtu || real_mtu < 60)
3782 		return EINVAL;
3783 
3784 	old_mtu = ifp->if_mtu;
3785 	ifp->if_mtu = mtu;
3786 	if (ifp->if_flags & IFF_RUNNING) {
3787 		mxge_close(sc, 0);
3788 		err = mxge_open(sc);
3789 		if (err != 0) {
3790 			ifp->if_mtu = old_mtu;
3791 			mxge_close(sc, 0);
3792 			mxge_open(sc);
3793 		}
3794 	}
3795 	return err;
3796 }
3797 
3798 static void
3799 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3800 {
3801 	mxge_softc_t *sc = ifp->if_softc;
3802 
3803 	ifmr->ifm_status = IFM_AVALID;
3804 	ifmr->ifm_active = IFM_ETHER;
3805 
3806 	if (sc->link_state)
3807 		ifmr->ifm_status |= IFM_ACTIVE;
3808 
3809 	/*
3810 	 * Autoselect is not supported, so the current media
3811 	 * should be delivered.
3812 	 */
3813 	ifmr->ifm_active |= sc->current_media;
3814 	if (sc->current_media != IFM_NONE) {
3815 		ifmr->ifm_active |= MXGE_IFM;
3816 		if (sc->pause)
3817 			ifmr->ifm_active |= IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
3818 	}
3819 }
3820 
3821 static int
3822 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3823     struct ucred *cr __unused)
3824 {
3825 	mxge_softc_t *sc = ifp->if_softc;
3826 	struct ifreq *ifr = (struct ifreq *)data;
3827 	int err, mask;
3828 
3829 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3830 	err = 0;
3831 
3832 	switch (command) {
3833 	case SIOCSIFMTU:
3834 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3835 		break;
3836 
3837 	case SIOCSIFFLAGS:
3838 		if (sc->dying)
3839 			return EINVAL;
3840 
3841 		if (ifp->if_flags & IFF_UP) {
3842 			if (!(ifp->if_flags & IFF_RUNNING)) {
3843 				err = mxge_open(sc);
3844 			} else {
3845 				/*
3846 				 * Take care of PROMISC and ALLMULTI
3847 				 * flag changes
3848 				 */
3849 				mxge_change_promisc(sc,
3850 				    ifp->if_flags & IFF_PROMISC);
3851 				mxge_set_multicast_list(sc);
3852 			}
3853 		} else {
3854 			if (ifp->if_flags & IFF_RUNNING)
3855 				mxge_close(sc, 0);
3856 		}
3857 		break;
3858 
3859 	case SIOCADDMULTI:
3860 	case SIOCDELMULTI:
3861 		mxge_set_multicast_list(sc);
3862 		break;
3863 
3864 	case SIOCSIFCAP:
3865 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3866 		if (mask & IFCAP_TXCSUM) {
3867 			ifp->if_capenable ^= IFCAP_TXCSUM;
3868 			if (ifp->if_capenable & IFCAP_TXCSUM)
3869 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3870 			else
3871 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3872 		}
3873 		if (mask & IFCAP_TSO) {
3874 			ifp->if_capenable ^= IFCAP_TSO;
3875 			if (ifp->if_capenable & IFCAP_TSO)
3876 				ifp->if_hwassist |= CSUM_TSO;
3877 			else
3878 				ifp->if_hwassist &= ~CSUM_TSO;
3879 		}
3880 		if (mask & IFCAP_RXCSUM)
3881 			ifp->if_capenable ^= IFCAP_RXCSUM;
3882 		if (mask & IFCAP_VLAN_HWTAGGING)
3883 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3884 		break;
3885 
3886 	case SIOCGIFMEDIA:
3887 	case SIOCSIFMEDIA:
3888 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3889 		    &sc->media, command);
3890 		break;
3891 
3892 	default:
3893 		err = ether_ioctl(ifp, command, data);
3894 		break;
3895 	}
3896 	return err;
3897 }
3898 
3899 static void
3900 mxge_fetch_tunables(mxge_softc_t *sc)
3901 {
3902 	int ifm;
3903 
3904 	sc->intr_coal_delay = mxge_intr_coal_delay;
3905 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3906 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3907 
3908 	/* XXX */
3909 	if (mxge_ticks == 0)
3910 		mxge_ticks = hz / 2;
3911 
3912 	ifm = ifmedia_str2ethfc(mxge_flowctrl);
3913 	if (ifm & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE))
3914 		sc->pause = 1;
3915 
3916 	sc->use_rss = mxge_use_rss;
3917 
3918 	sc->throttle = mxge_throttle;
3919 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3920 		sc->throttle = MXGE_MAX_THROTTLE;
3921 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3922 		sc->throttle = MXGE_MIN_THROTTLE;
3923 }
3924 
3925 static void
3926 mxge_free_slices(mxge_softc_t *sc)
3927 {
3928 	struct mxge_slice_state *ss;
3929 	int i;
3930 
3931 	if (sc->ss == NULL)
3932 		return;
3933 
3934 	for (i = 0; i < sc->num_slices; i++) {
3935 		ss = &sc->ss[i];
3936 		if (ss->fw_stats != NULL) {
3937 			mxge_dma_free(&ss->fw_stats_dma);
3938 			ss->fw_stats = NULL;
3939 		}
3940 		if (ss->rx_data.rx_done.entry != NULL) {
3941 			mxge_dma_free(&ss->rx_done_dma);
3942 			ss->rx_data.rx_done.entry = NULL;
3943 		}
3944 	}
3945 	kfree(sc->ss, M_DEVBUF);
3946 	sc->ss = NULL;
3947 }
3948 
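/*
 * Allocate the per-slice state.  The rx interrupt queue is sized to
 * twice the number of receive descriptors, presumably so that
 * completions for both the small and big rx rings can never
 * overflow it.
 */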
3949 static int
3950 mxge_alloc_slices(mxge_softc_t *sc)
3951 {
3952 	mxge_cmd_t cmd;
3953 	struct mxge_slice_state *ss;
3954 	size_t bytes;
3955 	int err, i, rx_ring_size;
3956 
3957 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3958 	if (err != 0) {
3959 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3960 		return err;
3961 	}
3962 	rx_ring_size = cmd.data0;
3963 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof(mcp_dma_addr_t));
3964 
3965 	bytes = sizeof(*sc->ss) * sc->num_slices;
3966 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3967 
3968 	for (i = 0; i < sc->num_slices; i++) {
3969 		ss = &sc->ss[i];
3970 
3971 		ss->sc = sc;
3972 
3973 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3974 		lwkt_serialize_init(&ss->tx.tx_serialize);
3975 		ss->intr_rid = -1;
3976 
3977 		/*
3978 		 * Allocate per-slice rx interrupt queue
3979 		 * XXX assumes a 4-byte mcp_slot
3980 		 */
3981 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3982 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3983 		if (err != 0) {
3984 			device_printf(sc->dev,
3985 			    "alloc %d slice rx_done failed\n", i);
3986 			return err;
3987 		}
3988 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3989 
3990 		/*
3991 		 * Allocate the per-slice firmware stats
3992 		 */
3993 		bytes = sizeof(*ss->fw_stats);
3994 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3995 		    bytes, 64);
3996 		if (err != 0) {
3997 			device_printf(sc->dev,
3998 			    "alloc %d fw_stats failed\n", i);
3999 			return err;
4000 		}
4001 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
4002 	}
4003 	return 0;
4004 }
4005 
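/*
 * Decide how many slices (and tx rings) to use.  Multiple slices
 * require more than one netisr cpu, at least two MSI-X vectors and
 * the RSS-capable firmware; if any step fails, fall back to a
 * single slice and restore the original firmware.
 */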
4006 static void
4007 mxge_slice_probe(mxge_softc_t *sc)
4008 {
4009 	int status, max_intr_slots, max_slices, num_slices;
4010 	int msix_cnt, msix_enable, multi_tx;
4011 	mxge_cmd_t cmd;
4012 	const char *old_fw;
4013 
4014 	sc->num_slices = 1;
4015 	sc->num_tx_rings = 1;
4016 
4017 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
4018 	if (num_slices == 1)
4019 		return;
4020 
4021 	if (netisr_ncpus == 1)
4022 		return;
4023 
4024 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
4025 	    mxge_msix_enable);
4026 	if (!msix_enable)
4027 		return;
4028 
4029 	msix_cnt = pci_msix_count(sc->dev);
4030 	if (msix_cnt < 2)
4031 		return;
4032 	if (bootverbose)
4033 		device_printf(sc->dev, "MSI-X count %d\n", msix_cnt);
4034 
4035 	/*
4036 	 * Now load the slice-aware firmware and see what it supports
4037 	 */
4038 	old_fw = sc->fw_name;
4039 	if (old_fw == mxge_fw_aligned)
4040 		sc->fw_name = mxge_fw_rss_aligned;
4041 	else
4042 		sc->fw_name = mxge_fw_rss_unaligned;
4043 	status = mxge_load_firmware(sc, 0);
4044 	if (status != 0) {
4045 		device_printf(sc->dev, "Falling back to a single slice\n");
4046 		return;
4047 	}
4048 
4049 	/*
4050 	 * Try to send a reset command to the card to see if it is alive
4051 	 */
4052 	memset(&cmd, 0, sizeof(cmd));
4053 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4054 	if (status != 0) {
4055 		device_printf(sc->dev, "failed reset\n");
4056 		goto abort_with_fw;
4057 	}
4058 
4059 	/*
4060 	 * Get rx ring size to calculate rx interrupt queue size
4061 	 */
4062 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4063 	if (status != 0) {
4064 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4065 		goto abort_with_fw;
4066 	}
4067 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4068 
4069 	/*
4070 	 * Tell it the size of the rx interrupt queue
4071 	 */
4072 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4073 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4074 	if (status != 0) {
4075 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4076 		goto abort_with_fw;
4077 	}
4078 
4079 	/*
4080 	 * Ask for the maximum number of slices it supports
4081 	 */
4082 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4083 	if (status != 0) {
4084 		device_printf(sc->dev,
4085 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4086 		goto abort_with_fw;
4087 	}
4088 	max_slices = cmd.data0;
4089 	if (bootverbose)
4090 		device_printf(sc->dev, "max slices %d\n", max_slices);
4091 
4092 	if (max_slices > msix_cnt)
4093 		max_slices = msix_cnt;
4094 
4095 	sc->ring_map = if_ringmap_alloc(sc->dev, num_slices, max_slices);
4096 	sc->num_slices = if_ringmap_count(sc->ring_map);
4097 
4098 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4099 	if (multi_tx)
4100 		sc->num_tx_rings = sc->num_slices;
4101 
4102 	if (bootverbose) {
4103 		device_printf(sc->dev, "using %d slices, max %d\n",
4104 		    sc->num_slices, max_slices);
4105 	}
4106 
4107 	if (sc->num_slices == 1)
4108 		goto abort_with_fw;
4109 	return;
4110 
4111 abort_with_fw:
4112 	sc->fw_name = old_fw;
4113 	mxge_load_firmware(sc, 0);
4114 }
4115 
4116 static void
4117 mxge_setup_serialize(struct mxge_softc *sc)
4118 {
4119 	int i = 0, slice;
4120 
4121 	/* Main + rx + tx */
4122 	sc->nserialize = (2 * sc->num_slices) + 1;
4123 	sc->serializes =
4124 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4125 	        M_DEVBUF, M_WAITOK | M_ZERO);
4126 
4127 	/*
4128 	 * Setup serializes
4129 	 *
4130 	 * NOTE: Order is critical
4131 	 */
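	/* Resulting layout with e.g. two slices: main, rx0, rx1, tx0, tx1 */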
4132 
4133 	KKASSERT(i < sc->nserialize);
4134 	sc->serializes[i++] = &sc->main_serialize;
4135 
4136 	for (slice = 0; slice < sc->num_slices; ++slice) {
4137 		KKASSERT(i < sc->nserialize);
4138 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4139 	}
4140 
4141 	for (slice = 0; slice < sc->num_slices; ++slice) {
4142 		KKASSERT(i < sc->nserialize);
4143 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4144 	}
4145 
4146 	KKASSERT(i == sc->nserialize);
4147 }
4148 
4149 static void
4150 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4151 {
4152 	struct mxge_softc *sc = ifp->if_softc;
4153 
4154 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4155 }
4156 
4157 static void
4158 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4159 {
4160 	struct mxge_softc *sc = ifp->if_softc;
4161 
4162 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4163 }
4164 
4165 static int
4166 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4167 {
4168 	struct mxge_softc *sc = ifp->if_softc;
4169 
4170 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4171 }
4172 
4173 #ifdef INVARIANTS
4174 
4175 static void
4176 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4177     boolean_t serialized)
4178 {
4179 	struct mxge_softc *sc = ifp->if_softc;
4180 
4181 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4182 	    slz, serialized);
4183 }
4184 
4185 #endif	/* INVARIANTS */
4186 
4187 #ifdef IFPOLL_ENABLE
4188 
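/*
 * ifpoll rx handler; runs on the slice's target cpu with the rx
 * serializer held and drains up to 'cycle' rx completions per call.
 */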
4189 static void
4190 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4191 {
4192 	struct mxge_slice_state *ss = xss;
4193 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4194 
4195 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4196 
4197 	if (rx_done->entry[rx_done->idx].length != 0) {
4198 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4199 	} else {
4200 		/*
4201 		 * XXX
4202 		 * This register write obviously has a cost; however,
4203 		 * if we don't hand back the rx token, the upcoming
4204 		 * packets may suffer ridiculously large delays, as
4205 		 * observed on 8AL-C using ping(8).
4206 		 */
4207 		*ss->irq_claim = be32toh(3);
4208 	}
4209 }
4210 
4211 static void
4212 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4213 {
4214 	struct mxge_softc *sc = ifp->if_softc;
4215 	int i;
4216 
4217 	if (info == NULL)
4218 		return;
4219 
4220 	/*
4221 	 * Only poll rx; polling tx and status don't seem to work
4222 	 */
4223 	for (i = 0; i < sc->num_slices; ++i) {
4224 		struct mxge_slice_state *ss = &sc->ss[i];
4225 		int cpu = ss->intr_cpuid;
4226 
4227 		KKASSERT(cpu < netisr_ncpus);
4228 		info->ifpi_rx[cpu].poll_func = mxge_npoll_rx;
4229 		info->ifpi_rx[cpu].arg = ss;
4230 		info->ifpi_rx[cpu].serializer = &ss->rx_data.rx_serialize;
4231 	}
4232 }
4233 
4234 #endif	/* IFPOLL_ENABLE */
4235 
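/*
 * Attach: set up PCI config space, map the board, parse the EEPROM
 * strings, enable write combining, allocate the command/scratch DMA
 * buffers, load firmware, probe slices, allocate interrupts and
 * serializers, reset the NIC, allocate rings, register the ifnet
 * and its subqueues, and finally start the periodic tick.  Any
 * failure unwinds through mxge_detach().
 */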
4236 static int
4237 mxge_attach(device_t dev)
4238 {
4239 	mxge_softc_t *sc = device_get_softc(dev);
4240 	struct ifnet *ifp = &sc->arpcom.ac_if;
4241 	int err, rid, i;
4242 
4243 	/*
4244 	 * Avoid rewriting half the lines in this file to use
4245 	 * &sc->arpcom.ac_if instead
4246 	 */
4247 	sc->ifp = ifp;
4248 	sc->dev = dev;
4249 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4250 
4251 	/* IFM_ETH_FORCEPAUSE can't be changed */
4252 	ifmedia_init(&sc->media, IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
4253 	    mxge_media_change, mxge_media_status);
4254 
4255 	lwkt_serialize_init(&sc->main_serialize);
4256 
4257 	mxge_fetch_tunables(sc);
4258 
4259 	err = bus_dma_tag_create(NULL,			/* parent */
4260 				 1,			/* alignment */
4261 				 0,			/* boundary */
4262 				 BUS_SPACE_MAXADDR,	/* low */
4263 				 BUS_SPACE_MAXADDR,	/* high */
4264 				 NULL, NULL,		/* filter */
4265 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4266 				 0, 			/* num segs */
4267 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4268 				 0,			/* flags */
4269 				 &sc->parent_dmat);	/* tag */
4270 	if (err != 0) {
4271 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4272 		goto failed;
4273 	}
4274 
4275 	callout_init_mp(&sc->co_hdl);
4276 
4277 	mxge_setup_cfg_space(sc);
4278 
4279 	/*
4280 	 * Map the board into the kernel
4281 	 */
4282 	rid = PCIR_BARS;
4283 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4284 	    &rid, RF_ACTIVE);
4285 	if (sc->mem_res == NULL) {
4286 		device_printf(dev, "could not map memory\n");
4287 		err = ENXIO;
4288 		goto failed;
4289 	}
4290 
4291 	sc->sram = rman_get_virtual(sc->mem_res);
4292 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4293 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4294 		device_printf(dev, "impossible memory region size %ld\n",
4295 		    rman_get_size(sc->mem_res));
4296 		err = ENXIO;
4297 		goto failed;
4298 	}
4299 
4300 	/*
4301 	 * Make a NULL-terminated copy of the EEPROM strings section
4302 	 * of LANai SRAM
4303 	 */
4304 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4305 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4306 	    rman_get_bushandle(sc->mem_res),
4307 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4308 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4309 	err = mxge_parse_strings(sc);
4310 	if (err != 0) {
4311 		device_printf(dev, "parse EEPROM string failed\n");
4312 		goto failed;
4313 	}
4314 
4315 	/*
4316 	 * Enable write combining for efficient use of PCIe bus
4317 	 */
4318 	mxge_enable_wc(sc);
4319 
4320 	/*
4321 	 * Allocate the out of band DMA memory
4322 	 */
4323 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4324 	if (err != 0) {
4325 		device_printf(dev, "alloc cmd DMA buf failed\n");
4326 		goto failed;
4327 	}
4328 	sc->cmd = sc->cmd_dma.dmem_addr;
4329 
4330 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4331 	if (err != 0) {
4332 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4333 		goto failed;
4334 	}
4335 
4336 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4337 	if (err != 0) {
4338 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4339 		goto failed;
4340 	}
4341 
4342 	/* Select & load the firmware */
4343 	err = mxge_select_firmware(sc);
4344 	if (err != 0) {
4345 		device_printf(dev, "select firmware failed\n");
4346 		goto failed;
4347 	}
4348 
4349 	mxge_slice_probe(sc);
4350 	err = mxge_alloc_slices(sc);
4351 	if (err != 0) {
4352 		device_printf(dev, "alloc slices failed\n");
4353 		goto failed;
4354 	}
4355 
4356 	err = mxge_alloc_intr(sc);
4357 	if (err != 0) {
4358 		device_printf(dev, "alloc intr failed\n");
4359 		goto failed;
4360 	}
4361 
4362 	/* Setup serializes */
4363 	mxge_setup_serialize(sc);
4364 
4365 	err = mxge_reset(sc, 0);
4366 	if (err != 0) {
4367 		device_printf(dev, "reset failed\n");
4368 		goto failed;
4369 	}
4370 
4371 	err = mxge_alloc_rings(sc);
4372 	if (err != 0) {
4373 		device_printf(dev, "failed to allocate rings\n");
4374 		goto failed;
4375 	}
4376 
4377 	ifp->if_baudrate = IF_Gbps(10UL);
4378 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4379 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4380 
4381 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4382 #if 0
4383 	/* Well, it's software, sigh */
4384 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4385 #endif
4386 	ifp->if_capenable = ifp->if_capabilities;
4387 
4388 	ifp->if_softc = sc;
4389 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4390 	ifp->if_init = mxge_init;
4391 	ifp->if_ioctl = mxge_ioctl;
4392 	ifp->if_start = mxge_start;
4393 #ifdef IFPOLL_ENABLE
4394 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4395 		ifp->if_npoll = mxge_npoll;
4396 #endif
4397 	ifp->if_serialize = mxge_serialize;
4398 	ifp->if_deserialize = mxge_deserialize;
4399 	ifp->if_tryserialize = mxge_tryserialize;
4400 #ifdef INVARIANTS
4401 	ifp->if_serialize_assert = mxge_serialize_assert;
4402 #endif
4403 
4404 	/* Increase TSO burst length */
4405 	ifp->if_tsolen = (32 * ETHERMTU);
4406 
4407 	/* Initialize the ifmedia structure */
4408 	mxge_media_init(sc);
4409 	mxge_media_probe(sc);
4410 
4411 	ether_ifattach(ifp, sc->mac_addr, NULL);
4412 
4413 	/* Setup TX rings and subqueues */
4414 	for (i = 0; i < sc->num_tx_rings; ++i) {
4415 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4416 		struct mxge_slice_state *ss = &sc->ss[i];
4417 
4418 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4419 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4420 		ifsq_set_priv(ifsq, &ss->tx);
4421 		ss->tx.ifsq = ifsq;
4422 
4423 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
4424 	}
4425 
4426 	/*
4427 	 * XXX
4428 	 * We are not ready to do "gather" jumbo frames, so
4429 	 * limit the MTU to MJUMPAGESIZE
4430 	 */
4431 	sc->max_mtu = MJUMPAGESIZE -
4432 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4433 	sc->dying = 0;
4434 
4435 	err = mxge_setup_intr(sc);
4436 	if (err != 0) {
4437 		device_printf(dev, "alloc and setup intr failed\n");
4438 		ether_ifdetach(ifp);
4439 		goto failed;
4440 	}
4441 
4442 	mxge_add_sysctls(sc);
4443 
4444 	/* Increase non-cluster mbuf limit; used by small RX rings */
4445 	mb_inclimit(ifp->if_nmbclusters);
4446 
4447 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4448 	    sc->ss[0].intr_cpuid);
4449 	return 0;
4450 
4451 failed:
4452 	mxge_detach(dev);
4453 	return err;
4454 }
4455 
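/*
 * Detach in roughly the reverse order of attach.  The slices must
 * outlive the sysctls, interrupts and rings that reference them,
 * hence mxge_free_slices() is called only after those are freed.
 */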
4456 static int
4457 mxge_detach(device_t dev)
4458 {
4459 	mxge_softc_t *sc = device_get_softc(dev);
4460 
4461 	if (device_is_attached(dev)) {
4462 		struct ifnet *ifp = sc->ifp;
4463 		int mblimit = ifp->if_nmbclusters;
4464 
4465 		ifnet_serialize_all(ifp);
4466 
4467 		sc->dying = 1;
4468 		if (ifp->if_flags & IFF_RUNNING)
4469 			mxge_close(sc, 1);
4470 		callout_stop(&sc->co_hdl);
4471 
4472 		mxge_teardown_intr(sc, sc->num_slices);
4473 
4474 		ifnet_deserialize_all(ifp);
4475 
4476 		callout_terminate(&sc->co_hdl);
4477 
4478 		ether_ifdetach(ifp);
4479 
4480 		/* Decrease non-cluster mbuf limit increased by us */
4481 		mb_inclimit(-mblimit);
4482 	}
4483 	ifmedia_removeall(&sc->media);
4484 
4485 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4486 	    sc->sram != NULL)
4487 		mxge_dummy_rdma(sc, 0);
4488 
4489 	mxge_free_intr(sc);
4490 	mxge_rem_sysctls(sc);
4491 	mxge_free_rings(sc);
4492 
4493 	/* MUST be after sysctls, intr and rings are freed */
4494 	mxge_free_slices(sc);
4495 
4496 	if (sc->dmabench_dma.dmem_addr != NULL)
4497 		mxge_dma_free(&sc->dmabench_dma);
4498 	if (sc->zeropad_dma.dmem_addr != NULL)
4499 		mxge_dma_free(&sc->zeropad_dma);
4500 	if (sc->cmd_dma.dmem_addr != NULL)
4501 		mxge_dma_free(&sc->cmd_dma);
4502 
4503 	if (sc->msix_table_res != NULL) {
4504 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4505 		    sc->msix_table_res);
4506 	}
4507 	if (sc->mem_res != NULL) {
4508 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4509 		    sc->mem_res);
4510 	}
4511 
4512 	if (sc->parent_dmat != NULL)
4513 		bus_dma_tag_destroy(sc->parent_dmat);
4514 
4515 	if (sc->ring_map != NULL)
4516 		if_ringmap_free(sc->ring_map);
4517 
4518 	return 0;
4519 }
4520 
4521 static int
4522 mxge_shutdown(device_t dev)
4523 {
4524 	return 0;
4525 }
4526 
4527 static void
4528 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4529 {
4530 	int i;
4531 
4532 	KKASSERT(sc->num_slices > 1);
4533 
4534 	for (i = 0; i < sc->num_slices; ++i) {
4535 		struct mxge_slice_state *ss = &sc->ss[i];
4536 
4537 		if (ss->intr_res != NULL) {
4538 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4539 			    ss->intr_rid, ss->intr_res);
4540 		}
4541 		if (ss->intr_rid >= 0)
4542 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4543 	}
4544 	if (setup)
4545 		pci_teardown_msix(sc->dev);
4546 }
4547 
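/*
 * MSI-X layout: vector 0 takes the combined tx/rx/status events of
 * slice 0 under the main serializer; vectors 1..n-1 take rx (or
 * rx + tx, when multiple tx rings are enabled) for their slice and
 * are targeted at the cpus assigned by the ring map.
 */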
4548 static int
4549 mxge_alloc_msix(struct mxge_softc *sc)
4550 {
4551 	struct mxge_slice_state *ss;
4552 	int rid, error, i;
4553 	boolean_t setup = FALSE;
4554 
4555 	KKASSERT(sc->num_slices > 1);
4556 
4557 	ss = &sc->ss[0];
4558 
4559 	ss->intr_serialize = &sc->main_serialize;
4560 	ss->intr_func = mxge_msi;
4561 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4562 	    "%s comb", device_get_nameunit(sc->dev));
4563 	ss->intr_desc = ss->intr_desc0;
4564 	ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, 0);
4565 
4566 	for (i = 1; i < sc->num_slices; ++i) {
4567 		ss = &sc->ss[i];
4568 
4569 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4570 		if (sc->num_tx_rings == 1) {
4571 			ss->intr_func = mxge_msix_rx;
4572 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4573 			    "%s rx%d", device_get_nameunit(sc->dev), i);
4574 		} else {
4575 			ss->intr_func = mxge_msix_rxtx;
4576 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4577 			    "%s rxtx%d", device_get_nameunit(sc->dev), i);
4578 		}
4579 		ss->intr_desc = ss->intr_desc0;
4580 		ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, i);
4581 	}
4582 
4583 	rid = PCIR_BAR(2);
4584 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4585 	    &rid, RF_ACTIVE);
4586 	if (sc->msix_table_res == NULL) {
4587 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4588 		return ENXIO;
4589 	}
4590 
4591 	error = pci_setup_msix(sc->dev);
4592 	if (error) {
4593 		device_printf(sc->dev, "could not setup MSI-X\n");
4594 		goto back;
4595 	}
4596 	setup = TRUE;
4597 
4598 	for (i = 0; i < sc->num_slices; ++i) {
4599 		ss = &sc->ss[i];
4600 
4601 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4602 		    ss->intr_cpuid);
4603 		if (error) {
4604 			device_printf(sc->dev, "could not alloc "
4605 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4606 			goto back;
4607 		}
4608 
4609 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4610 		    &ss->intr_rid, RF_ACTIVE);
4611 		if (ss->intr_res == NULL) {
4612 			device_printf(sc->dev, "could not alloc "
4613 			    "MSI-X %d resource\n", i);
4614 			error = ENXIO;
4615 			goto back;
4616 		}
4617 	}
4618 
4619 	pci_enable_msix(sc->dev);
4620 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4621 back:
4622 	if (error)
4623 		mxge_free_msix(sc, setup);
4624 	return error;
4625 }
4626 
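/*
 * Multiple slices require MSI-X; otherwise fall back to a single
 * MSI or legacy interrupt via pci_alloc_1intr().
 */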
4627 static int
4628 mxge_alloc_intr(struct mxge_softc *sc)
4629 {
4630 	struct mxge_slice_state *ss;
4631 	u_int irq_flags;
4632 
4633 	if (sc->num_slices > 1) {
4634 		int error;
4635 
4636 		error = mxge_alloc_msix(sc);
4637 		if (error)
4638 			return error;
4639 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4640 		return 0;
4641 	}
4642 
4643 	ss = &sc->ss[0];
4644 
4645 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4646 	    &ss->intr_rid, &irq_flags);
4647 
4648 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4649 	    &ss->intr_rid, irq_flags);
4650 	if (ss->intr_res == NULL) {
4651 		device_printf(sc->dev, "could not alloc interrupt\n");
4652 		return ENXIO;
4653 	}
4654 
4655 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4656 		ss->intr_func = mxge_legacy;
4657 	else
4658 		ss->intr_func = mxge_msi;
4659 	ss->intr_serialize = &sc->main_serialize;
4660 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4661 
4662 	return 0;
4663 }
4664 
4665 static int
4666 mxge_setup_intr(struct mxge_softc *sc)
4667 {
4668 	int i;
4669 
4670 	for (i = 0; i < sc->num_slices; ++i) {
4671 		struct mxge_slice_state *ss = &sc->ss[i];
4672 		int error;
4673 
4674 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4675 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4676 		    ss->intr_serialize, ss->intr_desc);
4677 		if (error) {
4678 			device_printf(sc->dev, "can't setup %dth intr\n", i);
4679 			mxge_teardown_intr(sc, i);
4680 			return error;
4681 		}
4682 	}
4683 	return 0;
4684 }
4685 
4686 static void
4687 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4688 {
4689 	int i;
4690 
4691 	if (sc->ss == NULL)
4692 		return;
4693 
4694 	for (i = 0; i < cnt; ++i) {
4695 		struct mxge_slice_state *ss = &sc->ss[i];
4696 
4697 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4698 	}
4699 }
4700 
4701 static void
4702 mxge_free_intr(struct mxge_softc *sc)
4703 {
4704 	if (sc->ss == NULL)
4705 		return;
4706 
4707 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4708 		struct mxge_slice_state *ss = &sc->ss[0];
4709 
4710 		if (ss->intr_res != NULL) {
4711 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4712 			    ss->intr_rid, ss->intr_res);
4713 		}
4714 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4715 			pci_release_msi(sc->dev);
4716 	} else {
4717 		mxge_free_msix(sc, TRUE);
4718 	}
4719 }
4720