xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 279dd846)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
29 
30 ***************************************************************************/
31 
32 #include "opt_ifpoll.h"
33 #include "opt_inet.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/linker.h>
38 #include <sys/firmware.h>
39 #include <sys/endian.h>
40 #include <sys/in_cksum.h>
41 #include <sys/sockio.h>
42 #include <sys/mbuf.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/module.h>
46 #include <sys/serialize.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 #include <net/if_poll.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/vlan/if_vlan_var.h>
62 #include <net/zlib.h>
63 #include <net/toeplitz.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
69 
70 #include <sys/bus.h>
71 #include <sys/rman.h>
72 
73 #include <bus/pci/pcireg.h>
74 #include <bus/pci/pcivar.h>
75 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
76 
77 #include <vm/vm.h>		/* for pmap_mapdev() */
78 #include <vm/pmap.h>
79 
80 #if defined(__i386__) || defined(__x86_64__)
81 #include <machine/specialreg.h>
82 #endif
83 
84 #include <dev/netif/mxge/mxge_mcp.h>
85 #include <dev/netif/mxge/mcp_gen_header.h>
86 #include <dev/netif/mxge/if_mxge_var.h>
87 
88 #define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
89 #define MXGE_HWRSS_KEYLEN		16
90 
91 /* Tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_ticks;
98 static int mxge_num_slices = 0;
99 static int mxge_always_promisc = 0;
100 static int mxge_throttle = 0;
101 static int mxge_msi_enable = 1;
102 static int mxge_msix_enable = 1;
103 static int mxge_multi_tx = 1;
104 /*
105  * Don't use RSS by default, it's just too slow
106  */
107 static int mxge_use_rss = 0;
108 
109 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
110 static const char *mxge_fw_aligned = "mxge_eth_z8e";
111 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
112 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
113 
114 TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
115 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
116 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
117 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
118 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
119 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
120 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
121 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
122 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
123 TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
124 TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
125 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
126 TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
127 
128 static int mxge_probe(device_t dev);
129 static int mxge_attach(device_t dev);
130 static int mxge_detach(device_t dev);
131 static int mxge_shutdown(device_t dev);
132 
133 static int mxge_alloc_intr(struct mxge_softc *sc);
134 static void mxge_free_intr(struct mxge_softc *sc);
135 static int mxge_setup_intr(struct mxge_softc *sc);
136 static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);
137 
138 static device_method_t mxge_methods[] = {
139 	/* Device interface */
140 	DEVMETHOD(device_probe, mxge_probe),
141 	DEVMETHOD(device_attach, mxge_attach),
142 	DEVMETHOD(device_detach, mxge_detach),
143 	DEVMETHOD(device_shutdown, mxge_shutdown),
144 	DEVMETHOD_END
145 };
146 
147 static driver_t mxge_driver = {
148 	"mxge",
149 	mxge_methods,
150 	sizeof(mxge_softc_t),
151 };
152 
153 static devclass_t mxge_devclass;
154 
155 /* Declare ourselves to be a child of the PCI bus. */
156 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
157 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
158 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
159 
160 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
161 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
162 static void mxge_close(mxge_softc_t *sc, int down);
163 static int mxge_open(mxge_softc_t *sc);
164 static void mxge_tick(void *arg);
165 static void mxge_watchdog_reset(mxge_softc_t *sc);
166 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
167 
168 static int
169 mxge_probe(device_t dev)
170 {
171 	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
172 	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
173 	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
174 		int rev = pci_get_revid(dev);
175 
176 		switch (rev) {
177 		case MXGE_PCI_REV_Z8E:
178 			device_set_desc(dev, "Myri10G-PCIE-8A");
179 			break;
180 		case MXGE_PCI_REV_Z8ES:
181 			device_set_desc(dev, "Myri10G-PCIE-8B");
182 			break;
183 		default:
184 			device_set_desc(dev, "Myri10G-PCIE-8??");
185 			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
186 			break;
187 		}
188 		return 0;
189 	}
190 	return ENXIO;
191 }
192 
193 static void
194 mxge_enable_wc(mxge_softc_t *sc)
195 {
196 #if defined(__i386__) || defined(__x86_64__)
197 	vm_offset_t len;
198 
199 	sc->wc = 1;
200 	len = rman_get_size(sc->mem_res);
201 	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
202 	    PAT_WRITE_COMBINING);
203 #endif
204 }
205 
206 static int
207 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
208     bus_size_t alignment)
209 {
210 	bus_size_t boundary;
211 	int err;
212 
213 	if (bytes > 4096 && alignment == 4096)
214 		boundary = 0;
215 	else
216 		boundary = 4096;
217 
218 	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
219 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
220 	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
221 	if (err != 0) {
222 		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
223 		return err;
224 	}
225 	return 0;
226 }
227 
228 static void
229 mxge_dma_free(bus_dmamem_t *dma)
230 {
231 	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
232 	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
233 	bus_dma_tag_destroy(dma->dmem_tag);
234 }
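/*
 * Usage sketch for the pair above (the sizes are illustrative, not
 * taken from the attach path):
 *
 *	bus_dmamem_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 8192, 4096) == 0) {
 *		... use dma.dmem_addr / dma.dmem_busaddr ...
 *		mxge_dma_free(&dma);
 *	}
 *
 * The boundary is relaxed to 0 only for allocations larger than 4KB
 * with 4KB alignment, which necessarily span 4KB boundaries; all other
 * allocations are kept within a single 4KB boundary.
 */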
235 
236 /*
237  * The eeprom strings on the lanaiX have the format
238  * SN=x\0
239  * MAC=x:x:x:x:x:x\0
240  * PC=text\0
241  */
242 static int
243 mxge_parse_strings(mxge_softc_t *sc)
244 {
245 	const char *ptr;
246 	int i, found_mac, found_sn2;
247 	char *endptr;
248 
249 	ptr = sc->eeprom_strings;
250 	found_mac = 0;
251 	found_sn2 = 0;
252 	while (*ptr != '\0') {
253 		if (strncmp(ptr, "MAC=", 4) == 0) {
254 			ptr += 4;
255 			for (i = 0;;) {
256 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
257 				if (endptr - ptr != 2)
258 					goto abort;
259 				ptr = endptr;
260 				if (++i == 6)
261 					break;
262 				if (*ptr++ != ':')
263 					goto abort;
264 			}
265 			found_mac = 1;
266 		} else if (strncmp(ptr, "PC=", 3) == 0) {
267 			ptr += 3;
268 			strlcpy(sc->product_code_string, ptr,
269 			    sizeof(sc->product_code_string));
270 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
271 			ptr += 3;
272 			strlcpy(sc->serial_number_string, ptr,
273 			    sizeof(sc->serial_number_string));
274 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
275 			/* SN2 takes precedence over SN */
276 			ptr += 4;
277 			found_sn2 = 1;
278 			strlcpy(sc->serial_number_string, ptr,
279 			    sizeof(sc->serial_number_string));
280 		}
281 		while (*ptr++ != '\0') {}
282 	}
283 
284 	if (found_mac)
285 		return 0;
286 
287 abort:
288 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
289 	return ENXIO;
290 }
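/*
 * Illustrative example (made-up values): the eeprom strings region
 * holds consecutive NUL-terminated strings, ended by an empty string:
 *
 *	"MAC=00:60:dd:12:34:56\0" "SN=123456\0" "PC=10G-PCIE-8A-C\0" "\0"
 *
 * Parsing such a blob would yield mac_addr 00:60:dd:12:34:56,
 * serial_number_string "123456" and product_code_string
 * "10G-PCIE-8A-C".
 */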
291 
292 #if defined(__i386__) || defined(__x86_64__)
293 
294 static void
295 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
296 {
297 	uint32_t val;
298 	unsigned long base, off;
299 	char *va, *cfgptr;
300 	device_t pdev, mcp55;
301 	uint16_t vendor_id, device_id, word;
302 	uintptr_t bus, slot, func, ivend, idev;
303 	uint32_t *ptr32;
304 
305 	if (!mxge_nvidia_ecrc_enable)
306 		return;
307 
308 	pdev = device_get_parent(device_get_parent(sc->dev));
309 	if (pdev == NULL) {
310 		device_printf(sc->dev, "could not find parent?\n");
311 		return;
312 	}
313 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
314 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
315 
316 	if (vendor_id != 0x10de)
317 		return;
318 
319 	base = 0;
320 
321 	if (device_id == 0x005d) {
322 		/* ck804, base address is magic */
323 		base = 0xe0000000UL;
324 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
325 		/* mcp55, base address stored in chipset */
326 		mcp55 = pci_find_bsf(0, 0, 0);
327 		if (mcp55 &&
328 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
329 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
330 			word = pci_read_config(mcp55, 0x90, 2);
331 			base = ((unsigned long)word & 0x7ffeU) << 25;
332 		}
333 	}
334 	if (!base)
335 		return;
336 
337 	/*
338 	 * XXXX
339 	 * Test below is commented out because it is believed that doing
340 	 * config read/write beyond 0xff will access the config space
341 	 * for the next larger function.  Uncomment this and remove
342 	 * the hacky pmap_mapdev() way of accessing config space when
343 	 * DragonFly grows support for extended PCIe config space access.
344 	 */
345 #if 0
346 	/*
347 	 * See if we can, by some miracle, access the extended
348 	 * config space
349 	 */
350 	val = pci_read_config(pdev, 0x178, 4);
351 	if (val != 0xffffffff) {
352 		val |= 0x40;
353 		pci_write_config(pdev, 0x178, val, 4);
354 		return;
355 	}
356 #endif
357 	/*
358 	 * Rather than using normal PCI config space writes, we must
359 	 * map the Nvidia config space ourselves.  This is because on
360 	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
361 	 * handled by the Nvidia chipset, which means the internal PCI
362 	 * device (the on-chip northbridge), or the AMD-8131 bridge,
363 	 * and things behind them are not visible to this method.
364 	 */
365 
366 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
367 		      PCI_IVAR_BUS, &bus);
368 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
369 		      PCI_IVAR_SLOT, &slot);
370 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
371 		      PCI_IVAR_FUNCTION, &func);
372 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
373 		      PCI_IVAR_VENDOR, &ivend);
374 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
375 		      PCI_IVAR_DEVICE, &idev);
376 
377 	off =  base + 0x00100000UL * (unsigned long)bus +
378 	    0x00001000UL * (unsigned long)(func + 8 * slot);
379 
380 	/* map it into the kernel */
381 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
382 	if (va == NULL) {
383 		device_printf(sc->dev, "pmap_mapdev failed\n");
384 		return;
385 	}
386 	/* get a pointer to the config space mapped into the kernel */
387 	cfgptr = va + (off & PAGE_MASK);
388 
389 	/* make sure that we can really access it */
390 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
391 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
392 	if (!(vendor_id == ivend && device_id == idev)) {
393 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
394 		    vendor_id, device_id);
395 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
396 		return;
397 	}
398 
399 	ptr32 = (uint32_t*)(cfgptr + 0x178);
400 	val = *ptr32;
401 
402 	if (val == 0xffffffff) {
403 		device_printf(sc->dev, "extended mapping failed\n");
404 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
405 		return;
406 	}
407 	*ptr32 = val | 0x40;
408 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
409 	if (bootverbose) {
410 		device_printf(sc->dev, "Enabled ECRC on upstream "
411 		    "Nvidia bridge at %d:%d:%d\n",
412 		    (int)bus, (int)slot, (int)func);
413 	}
414 }
415 
416 #else	/* __i386__ || __x86_64__ */
417 
418 static void
419 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
420 {
421 	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
422 }
423 
424 #endif
425 
426 static int
427 mxge_dma_test(mxge_softc_t *sc, int test_type)
428 {
429 	mxge_cmd_t cmd;
430 	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
431 	int status;
432 	uint32_t len;
433 	const char *test = " ";
434 
435 	/*
436 	 * Run a small DMA test.
437 	 * The magic multipliers to the length tell the firmware
438 	 * to do DMA read, write, or read+write tests.  The
439 	 * results are returned in cmd.data0.  The upper 16
440 	 * bits of the return are the number of transfers completed.
441 	 * The lower 16 bits are the time in 0.5us ticks that the
442 	 * transfers took to complete.
443 	 */
444 
445 	len = sc->tx_boundary;
446 
447 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
448 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
449 	cmd.data2 = len * 0x10000;
450 	status = mxge_send_cmd(sc, test_type, &cmd);
451 	if (status != 0) {
452 		test = "read";
453 		goto abort;
454 	}
455 	sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
456 
457 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
458 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
459 	cmd.data2 = len * 0x1;
460 	status = mxge_send_cmd(sc, test_type, &cmd);
461 	if (status != 0) {
462 		test = "write";
463 		goto abort;
464 	}
465 	sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
466 
467 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
468 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
469 	cmd.data2 = len * 0x10001;
470 	status = mxge_send_cmd(sc, test_type, &cmd);
471 	if (status != 0) {
472 		test = "read/write";
473 		goto abort;
474 	}
475 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
476 	    (cmd.data0 & 0xffff);
477 
478 abort:
479 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
480 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
481 		    test, status);
482 	}
483 	return status;
484 }
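/*
 * Worked example with hypothetical numbers: if len is 4096 and the
 * read test returns cmd.data0 = 0x00640200, then 0x0064 (100)
 * transfers completed in 0x0200 (512) half-microsecond ticks, i.e.
 * 409600 bytes in 256us, so
 *
 *	read_dma = (100 * 4096 * 2) / 512 = 1600
 *
 * or roughly 1600 MB/s; the factor of 2 converts the 0.5us ticks into
 * microseconds.  The read/write test doubles the result again, since
 * each transaction moves data in both directions.
 */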
485 
486 /*
487  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
488  * when the PCI-E Completion packets are aligned on an 8-byte
489  * boundary.  Some PCI-E chip sets always align Completion packets; on
490  * the ones that do not, the alignment can be enforced by enabling
491  * ECRC generation (if supported).
492  *
493  * When PCI-E Completion packets are not aligned, it is actually more
494  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
495  *
496  * If the driver can neither enable ECRC nor verify that it has
497  * already been enabled, then it must use a firmware image which works
498  * around unaligned completion packets (ethp_z8e.dat), and it should
499  * also ensure that it never gives the device a Read-DMA which is
500  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
501  * enabled, then the driver should use the aligned (eth_z8e.dat)
502  * firmware image, and set tx_boundary to 4KB.
503  */
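/*
 * Summarized as a sketch, the choices made below come down to:
 *
 *	completions aligned (ECRC enabled, forced, or link width <= 4)
 *		-> eth_z8e firmware,  tx_boundary = 4096
 *	completions unaligned or unverifiable
 *		-> ethp_z8e firmware, tx_boundary = 2048
 */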
504 static int
505 mxge_firmware_probe(mxge_softc_t *sc)
506 {
507 	device_t dev = sc->dev;
508 	int reg, status;
509 	uint16_t pectl;
510 
511 	sc->tx_boundary = 4096;
512 
513 	/*
514 	 * Verify the max read request size was set to 4KB
515 	 * before trying the test with 4KB.
516 	 */
517 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
518 		pectl = pci_read_config(dev, reg + 0x8, 2);
519 		if ((pectl & (5 << 12)) != (5 << 12)) {
520 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
521 			    pectl);
522 			sc->tx_boundary = 2048;
523 		}
524 	}
525 
526 	/*
527 	 * Load the optimized firmware (which assumes aligned PCIe
528 	 * completions) in order to see if it works on this host.
529 	 */
530 	sc->fw_name = mxge_fw_aligned;
531 	status = mxge_load_firmware(sc, 1);
532 	if (status != 0)
533 		return status;
534 
535 	/*
536 	 * Enable ECRC if possible
537 	 */
538 	mxge_enable_nvidia_ecrc(sc);
539 
540 	/*
541 	 * Run a DMA test which watches for unaligned completions and
542 	 * aborts on the first one seen.  Not required on Z8ES or newer.
543 	 */
544 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
545 		return 0;
546 
547 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
548 	if (status == 0)
549 		return 0; /* keep the aligned firmware */
550 
551 	if (status != E2BIG)
552 		device_printf(dev, "DMA test failed: %d\n", status);
553 	if (status == ENOSYS) {
554 		device_printf(dev, "Falling back to ethp! "
555 		    "Please install up to date fw\n");
556 	}
557 	return status;
558 }
559 
560 static int
561 mxge_select_firmware(mxge_softc_t *sc)
562 {
563 	int aligned = 0;
564 	int force_firmware = mxge_force_firmware;
565 
566 	if (sc->throttle)
567 		force_firmware = sc->throttle;
568 
569 	if (force_firmware != 0) {
570 		if (force_firmware == 1)
571 			aligned = 1;
572 		else
573 			aligned = 0;
574 		if (bootverbose) {
575 			device_printf(sc->dev,
576 			    "Assuming %s completions (forced)\n",
577 			    aligned ? "aligned" : "unaligned");
578 		}
579 		goto abort;
580 	}
581 
582 	/*
583 	 * If the PCIe link width is 4 or less, we can use the aligned
584 	 * firmware and skip any checks
585 	 */
586 	if (sc->link_width != 0 && sc->link_width <= 4) {
587 		device_printf(sc->dev, "PCIe x%d Link, "
588 		    "expect reduced performance\n", sc->link_width);
589 		aligned = 1;
590 		goto abort;
591 	}
592 
593 	if (mxge_firmware_probe(sc) == 0)
594 		return 0;
595 
596 abort:
597 	if (aligned) {
598 		sc->fw_name = mxge_fw_aligned;
599 		sc->tx_boundary = 4096;
600 	} else {
601 		sc->fw_name = mxge_fw_unaligned;
602 		sc->tx_boundary = 2048;
603 	}
604 	return mxge_load_firmware(sc, 0);
605 }
606 
607 static int
608 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
609 {
610 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
611 		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
612 		    be32toh(hdr->mcp_type));
613 		return EIO;
614 	}
615 
616 	/* Save firmware version for sysctl */
617 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
618 	if (bootverbose)
619 		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
620 
621 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
622 	    &sc->fw_ver_minor, &sc->fw_ver_tiny);
623 
624 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
625 	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
626 		if_printf(sc->ifp, "Found firmware version %s\n",
627 		    sc->fw_version);
628 		if_printf(sc->ifp, "Driver needs %d.%d\n",
629 		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
630 		return EINVAL;
631 	}
632 	return 0;
633 }
634 
635 static void *
636 z_alloc(void *nil, u_int items, u_int size)
637 {
638 	return kmalloc(items * size, M_TEMP, M_WAITOK);
639 }
640 
641 static void
642 z_free(void *nil, void *ptr)
643 {
644 	kfree(ptr, M_TEMP);
645 }
646 
647 static int
648 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
649 {
650 	z_stream zs;
651 	char *inflate_buffer;
652 	const struct firmware *fw;
653 	const mcp_gen_header_t *hdr;
654 	unsigned hdr_offset;
655 	int status;
656 	unsigned int i;
657 	char dummy;
658 	size_t fw_len;
659 
660 	fw = firmware_get(sc->fw_name);
661 	if (fw == NULL) {
662 		if_printf(sc->ifp, "Could not find firmware image %s\n",
663 		    sc->fw_name);
664 		return ENOENT;
665 	}
666 
667 	/* Setup zlib and decompress f/w */
668 	bzero(&zs, sizeof(zs));
669 	zs.zalloc = z_alloc;
670 	zs.zfree = z_free;
671 	status = inflateInit(&zs);
672 	if (status != Z_OK) {
673 		status = EIO;
674 		goto abort_with_fw;
675 	}
676 
677 	/*
678 	 * The uncompressed size is stored as the firmware version,
679 	 * which would otherwise go unused
680 	 */
681 	fw_len = (size_t)fw->version;
682 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
683 	zs.avail_in = fw->datasize;
684 	zs.next_in = __DECONST(char *, fw->data);
685 	zs.avail_out = fw_len;
686 	zs.next_out = inflate_buffer;
687 	status = inflate(&zs, Z_FINISH);
688 	if (status != Z_STREAM_END) {
689 		if_printf(sc->ifp, "zlib %d\n", status);
690 		status = EIO;
691 		goto abort_with_buffer;
692 	}
693 
694 	/* Check id */
695 	hdr_offset =
696 	htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
697 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
698 		if_printf(sc->ifp, "Bad firmware file");
699 		status = EIO;
700 		goto abort_with_buffer;
701 	}
702 	hdr = (const void*)(inflate_buffer + hdr_offset);
703 
704 	status = mxge_validate_firmware(sc, hdr);
705 	if (status != 0)
706 		goto abort_with_buffer;
707 
708 	/* Copy the inflated firmware to NIC SRAM. */
709 	for (i = 0; i < fw_len; i += 256) {
710 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
711 		    min(256U, (unsigned)(fw_len - i)));
712 		wmb();
713 		dummy = *sc->sram;
714 		wmb();
715 	}
716 
717 	*limit = fw_len;
718 	status = 0;
719 abort_with_buffer:
720 	kfree(inflate_buffer, M_TEMP);
721 	inflateEnd(&zs);
722 abort_with_fw:
723 	firmware_put(fw, FIRMWARE_UNLOAD);
724 	return status;
725 }
726 
727 /*
728  * Enable or disable periodic RDMAs from the host to make certain
729  * chipsets resend dropped PCIe messages
730  */
731 static void
732 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
733 {
734 	char buf_bytes[72];
735 	volatile uint32_t *confirm;
736 	volatile char *submit;
737 	uint32_t *buf, dma_low, dma_high;
738 	int i;
739 
740 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
741 
742 	/* Clear confirmation addr */
743 	confirm = (volatile uint32_t *)sc->cmd;
744 	*confirm = 0;
745 	wmb();
746 
747 	/*
748 	 * Send an rdma command to the PCIe engine, and wait for the
749 	 * response in the confirmation address.  The firmware should
750 	 * write a -1 there to indicate it is alive and well
751 	 */
752 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
753 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
754 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
755 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
756 	buf[2] = htobe32(0xffffffff);		/* confirm data */
757 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
758 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
759 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
760 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
761 	buf[5] = htobe32(enable);		/* enable? */
762 
763 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
764 
765 	mxge_pio_copy(submit, buf, 64);
766 	wmb();
767 	DELAY(1000);
768 	wmb();
769 	i = 0;
770 	while (*confirm != 0xffffffff && i < 20) {
771 		DELAY(1000);
772 		i++;
773 	}
774 	if (*confirm != 0xffffffff) {
775 		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)",
776 		    (enable ? "enable" : "disable"), confirm, *confirm);
777 	}
778 }
779 
780 static int
781 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
782 {
783 	mcp_cmd_t *buf;
784 	char buf_bytes[sizeof(*buf) + 8];
785 	volatile mcp_cmd_response_t *response = sc->cmd;
786 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
787 	uint32_t dma_low, dma_high;
788 	int err, sleep_total = 0;
789 
790 	/* Ensure buf is aligned to 8 bytes */
791 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
792 
793 	buf->data0 = htobe32(data->data0);
794 	buf->data1 = htobe32(data->data1);
795 	buf->data2 = htobe32(data->data2);
796 	buf->cmd = htobe32(cmd);
797 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
798 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
799 
800 	buf->response_addr.low = htobe32(dma_low);
801 	buf->response_addr.high = htobe32(dma_high);
802 
803 	response->result = 0xffffffff;
804 	wmb();
805 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
806 
807 	/*
808 	 * Wait up to 20ms
809 	 */
810 	err = EAGAIN;
811 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
812 		wmb();
813 		switch (be32toh(response->result)) {
814 		case 0:
815 			data->data0 = be32toh(response->data);
816 			err = 0;
817 			break;
818 		case 0xffffffff:
819 			DELAY(1000);
820 			break;
821 		case MXGEFW_CMD_UNKNOWN:
822 			err = ENOSYS;
823 			break;
824 		case MXGEFW_CMD_ERROR_UNALIGNED:
825 			err = E2BIG;
826 			break;
827 		case MXGEFW_CMD_ERROR_BUSY:
828 			err = EBUSY;
829 			break;
830 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
831 			err = ENXIO;
832 			break;
833 		default:
834 			if_printf(sc->ifp, "command %d failed, result = %d\n",
835 			    cmd, be32toh(response->result));
836 			err = ENXIO;
837 			break;
838 		}
839 		if (err != EAGAIN)
840 			break;
841 	}
842 	if (err == EAGAIN) {
843 		if_printf(sc->ifp, "command %d timed out result = %d\n",
844 		    cmd, be32toh(response->result));
845 	}
846 	return err;
847 }
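/*
 * Usage sketch: callers fill in cmd.data0..2 as inputs and read any
 * result back from cmd.data0, mirroring e.g. the irq-ack offset lookup
 * done later in mxge_reset():
 *
 *	mxge_cmd_t cmd;
 *
 *	cmd.data0 = 0;
 *	if (mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd) == 0)
 *		irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
 */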
848 
849 static int
850 mxge_adopt_running_firmware(mxge_softc_t *sc)
851 {
852 	struct mcp_gen_header *hdr;
853 	const size_t bytes = sizeof(struct mcp_gen_header);
854 	size_t hdr_offset;
855 	int status;
856 
857 	/*
858 	 * Find running firmware header
859 	 */
860 	hdr_offset =
861 	htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
862 
863 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
864 		if_printf(sc->ifp, "Running firmware has bad header offset "
865 		    "(%zu)\n", hdr_offset);
866 		return EIO;
867 	}
868 
869 	/*
870 	 * Copy header of running firmware from SRAM to host memory to
871 	 * validate firmware
872 	 */
873 	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
874 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
875 	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
876 	status = mxge_validate_firmware(sc, hdr);
877 	kfree(hdr, M_DEVBUF);
878 
879 	/*
880 	 * Check to see if adopted firmware has bug where adopting
881 	 * it will cause broadcasts to be filtered unless the NIC
882 	 * is kept in ALLMULTI mode
883 	 */
884 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
885 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
886 		sc->adopted_rx_filter_bug = 1;
887 		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
888 		    "working around rx filter bug\n",
889 		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
890 	}
891 
892 	return status;
893 }
894 
895 static int
896 mxge_load_firmware(mxge_softc_t *sc, int adopt)
897 {
898 	volatile uint32_t *confirm;
899 	volatile char *submit;
900 	char buf_bytes[72];
901 	uint32_t *buf, size, dma_low, dma_high;
902 	int status, i;
903 
904 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
905 
906 	size = sc->sram_size;
907 	status = mxge_load_firmware_helper(sc, &size);
908 	if (status) {
909 		if (!adopt)
910 			return status;
911 
912 		/*
913 		 * Try to use the currently running firmware, if
914 		 * it is new enough
915 		 */
916 		status = mxge_adopt_running_firmware(sc);
917 		if (status) {
918 			if_printf(sc->ifp,
919 			    "failed to adopt running firmware\n");
920 			return status;
921 		}
922 		if_printf(sc->ifp, "Successfully adopted running firmware\n");
923 
924 		if (sc->tx_boundary == 4096) {
925 			if_printf(sc->ifp,
926 			     "Using firmware currently running on NIC.  "
927 			     "For optimal\n");
928 			if_printf(sc->ifp, "performance consider loading "
929 			     "optimized firmware\n");
930 		}
931 		sc->fw_name = mxge_fw_unaligned;
932 		sc->tx_boundary = 2048;
933 		return 0;
934 	}
935 
936 	/* Clear confirmation addr */
937 	confirm = (volatile uint32_t *)sc->cmd;
938 	*confirm = 0;
939 	wmb();
940 
941 	/*
942 	 * Send a reload command to the bootstrap MCP, and wait for the
943 	 * response in the confirmation address.  The firmware should
944 	 * write a -1 there to indicate it is alive and well
945 	 */
946 
947 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
948 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
949 
950 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
951 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
952 	buf[2] = htobe32(0xffffffff);	/* confirm data */
953 
954 	/*
955 	 * FIX: All newest firmware should un-protect the bottom of
956 	 * the sram before handoff.  However, the very first interfaces
957 	 * do not, so the handoff copy must skip the first 8 bytes.
958 	 */
959 					/* where the code starts */
960 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
961 	buf[4] = htobe32(size - 8); 	/* length of code */
962 	buf[5] = htobe32(8);		/* where to copy to */
963 	buf[6] = htobe32(0);		/* where to jump to */
964 
965 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
966 	mxge_pio_copy(submit, buf, 64);
967 	wmb();
968 	DELAY(1000);
969 	wmb();
970 	i = 0;
971 	while (*confirm != 0xffffffff && i < 20) {
972 		DELAY(1000*10);
973 		i++;
974 	}
975 	if (*confirm != 0xffffffff) {
976 		if_printf(sc->ifp,"handoff failed (%p = 0x%x)",
977 		    confirm, *confirm);
978 		return ENXIO;
979 	}
980 	return 0;
981 }
982 
983 static int
984 mxge_update_mac_address(mxge_softc_t *sc)
985 {
986 	mxge_cmd_t cmd;
987 	uint8_t *addr = sc->mac_addr;
988 
989 	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
990 	    (addr[2] << 8) | addr[3];
991 	cmd.data1 = (addr[4] << 8) | (addr[5]);
992 	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
993 }
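/*
 * Example with a made-up address 00:60:dd:12:34:56: the bytes are
 * packed MSB-first into the two command words as
 *
 *	cmd.data0 = 0x0060dd12		(addr[0..3])
 *	cmd.data1 = 0x00003456		(addr[4..5])
 */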
994 
995 static int
996 mxge_change_pause(mxge_softc_t *sc, int pause)
997 {
998 	mxge_cmd_t cmd;
999 	int status;
1000 
1001 	if (pause)
1002 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
1003 	else
1004 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
1005 	if (status) {
1006 		if_printf(sc->ifp, "Failed to set flow control mode\n");
1007 		return ENXIO;
1008 	}
1009 	sc->pause = pause;
1010 	return 0;
1011 }
1012 
1013 static void
1014 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1015 {
1016 	mxge_cmd_t cmd;
1017 	int status;
1018 
1019 	if (mxge_always_promisc)
1020 		promisc = 1;
1021 
1022 	if (promisc)
1023 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1024 	else
1025 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1026 	if (status)
1027 		if_printf(sc->ifp, "Failed to set promisc mode\n");
1028 }
1029 
1030 static void
1031 mxge_set_multicast_list(mxge_softc_t *sc)
1032 {
1033 	mxge_cmd_t cmd;
1034 	struct ifmultiaddr *ifma;
1035 	struct ifnet *ifp = sc->ifp;
1036 	int err;
1037 
1038 	/* This firmware is known to not support multicast */
1039 	if (!sc->fw_multicast_support)
1040 		return;
1041 
1042 	/* Disable multicast filtering while we play with the lists */
1043 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1044 	if (err != 0) {
1045 		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1046 		    "error status: %d\n", err);
1047 		return;
1048 	}
1049 
1050 	if (sc->adopted_rx_filter_bug)
1051 		return;
1052 
1053 	if (ifp->if_flags & IFF_ALLMULTI) {
1054 		/* Request to disable multicast filtering, so quit here */
1055 		return;
1056 	}
1057 
1058 	/* Flush all the filters */
1059 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1060 	if (err != 0) {
1061 		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1062 		    "error status: %d\n", err);
1063 		return;
1064 	}
1065 
1066 	/*
1067 	 * Walk the multicast list, and add each address
1068 	 */
1069 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1070 		if (ifma->ifma_addr->sa_family != AF_LINK)
1071 			continue;
1072 
1073 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1074 		    &cmd.data0, 4);
1075 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1076 		    &cmd.data1, 2);
1077 		cmd.data0 = htonl(cmd.data0);
1078 		cmd.data1 = htonl(cmd.data1);
1079 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1080 		if (err != 0) {
1081 			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1082 			    "error status: %d\n", err);
1083 			/* Abort, leaving multicast filtering off */
1084 			return;
1085 		}
1086 	}
1087 
1088 	/* Enable multicast filtering */
1089 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1090 	if (err != 0) {
1091 		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1092 		    "error status: %d\n", err);
1093 	}
1094 }
1095 
1096 #if 0
1097 static int
1098 mxge_max_mtu(mxge_softc_t *sc)
1099 {
1100 	mxge_cmd_t cmd;
1101 	int status;
1102 
1103 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1104 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1105 
1106 	/* try to set nbufs to see if we can
1107 	   use virtually contiguous jumbos */
1108 	cmd.data0 = 0;
1109 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1110 			       &cmd);
1111 	if (status == 0)
1112 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1113 
1114 	/* otherwise, we're limited to MJUMPAGESIZE */
1115 	return MJUMPAGESIZE - MXGEFW_PAD;
1116 }
1117 #endif
1118 
1119 static int
1120 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1121 {
1122 	struct mxge_slice_state *ss;
1123 	mxge_rx_done_t *rx_done;
1124 	volatile uint32_t *irq_claim;
1125 	mxge_cmd_t cmd;
1126 	int slice, status, rx_intr_size;
1127 
1128 	/*
1129 	 * Try to send a reset command to the card to see if it
1130 	 * is alive
1131 	 */
1132 	memset(&cmd, 0, sizeof (cmd));
1133 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1134 	if (status != 0) {
1135 		if_printf(sc->ifp, "failed reset\n");
1136 		return ENXIO;
1137 	}
1138 
1139 	mxge_dummy_rdma(sc, 1);
1140 
1141 	/*
1142 	 * Set the intrq size
1143 	 * XXX assume 4-byte mcp_slot
1144 	 */
1145 	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
1146 	cmd.data0 = rx_intr_size;
1147 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1148 
1149 	/*
1150 	 * Even though we already know how many slices are supported
1151 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1152 	 * has magic side effects, and must be called after a reset.
1153 	 * It must be called prior to calling any RSS related cmds,
1154 	 * including assigning an interrupt queue for anything but
1155 	 * slice 0.  It must also be called *after*
1156 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1157 	 * the firmware to compute offsets.
1158 	 */
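	/*
	 * In sketch form, the required command ordering after each
	 * reset is:
	 *
	 *	MXGEFW_CMD_RESET
	 *	MXGEFW_CMD_SET_INTRQ_SIZE
	 *	MXGEFW_CMD_GET_MAX_RSS_QUEUES	(for the side effects)
	 *	MXGEFW_CMD_ENABLE_RSS_QUEUES
	 *	MXGEFW_CMD_SET_INTRQ_DMA	(one per slice)
	 */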
1159 	if (sc->num_slices > 1) {
1160 		/* Ask the maximum number of slices it supports */
1161 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1162 		if (status != 0) {
1163 			if_printf(sc->ifp, "failed to get number of slices\n");
1164 			return status;
1165 		}
1166 
1167 		/*
1168 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1169 		 * to setting up the interrupt queue DMA
1170 		 */
1171 		cmd.data0 = sc->num_slices;
1172 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1173 		if (sc->num_tx_rings > 1)
1174 			cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1175 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1176 		if (status != 0) {
1177 			if_printf(sc->ifp, "failed to set number of slices\n");
1178 			return status;
1179 		}
1180 	}
1181 
1182 	if (interrupts_setup) {
1183 		/* Now exchange information about interrupts  */
1184 		for (slice = 0; slice < sc->num_slices; slice++) {
1185 			ss = &sc->ss[slice];
1186 
1187 			rx_done = &ss->rx_data.rx_done;
1188 			memset(rx_done->entry, 0, rx_intr_size);
1189 
1190 			cmd.data0 =
1191 			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1192 			cmd.data1 =
1193 			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1194 			cmd.data2 = slice;
1195 			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
1196 			    &cmd);
1197 		}
1198 	}
1199 
1200 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1201 	    &cmd);
1202 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1203 
1204 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1205 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1206 
1207 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1208 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1209 
1210 	if (status != 0) {
1211 		if_printf(sc->ifp, "failed set interrupt parameters\n");
1212 		return status;
1213 	}
1214 
1215 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1216 
1217 	/* Run a DMA benchmark */
1218 	mxge_dma_test(sc, MXGEFW_DMA_TEST);
1219 
1220 	for (slice = 0; slice < sc->num_slices; slice++) {
1221 		ss = &sc->ss[slice];
1222 
1223 		ss->irq_claim = irq_claim + (2 * slice);
1224 
1225 		/* Reset mcp/driver shared state back to 0 */
1226 		ss->rx_data.rx_done.idx = 0;
1227 		ss->tx.req = 0;
1228 		ss->tx.done = 0;
1229 		ss->tx.pkt_done = 0;
1230 		ss->tx.queue_active = 0;
1231 		ss->tx.activate = 0;
1232 		ss->tx.deactivate = 0;
1233 		ss->rx_data.rx_big.cnt = 0;
1234 		ss->rx_data.rx_small.cnt = 0;
1235 		if (ss->fw_stats != NULL)
1236 			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1237 	}
1238 	sc->rdma_tags_available = 15;
1239 
1240 	status = mxge_update_mac_address(sc);
1241 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1242 	mxge_change_pause(sc, sc->pause);
1243 	mxge_set_multicast_list(sc);
1244 
1245 	if (sc->throttle) {
1246 		cmd.data0 = sc->throttle;
1247 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1248 			if_printf(sc->ifp, "can't enable throttle\n");
1249 	}
1250 	return status;
1251 }
1252 
1253 static int
1254 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1255 {
1256 	mxge_cmd_t cmd;
1257 	mxge_softc_t *sc;
1258 	int err;
1259 	unsigned int throttle;
1260 
1261 	sc = arg1;
1262 	throttle = sc->throttle;
1263 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1264 	if (err != 0)
1265 		return err;
1266 
1267 	if (throttle == sc->throttle)
1268 		return 0;
1269 
1270 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1271 		return EINVAL;
1272 
1273 	ifnet_serialize_all(sc->ifp);
1274 
1275 	cmd.data0 = throttle;
1276 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1277 	if (err == 0)
1278 		sc->throttle = throttle;
1279 
1280 	ifnet_deserialize_all(sc->ifp);
1281 	return err;
1282 }
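/*
 * This handler is reachable from userland through sysctl(8), e.g.
 * (unit number and value hypothetical):
 *
 *	sysctl dev.mxge.0.throttle=<factor>
 *
 * Values outside [MXGE_MIN_THROTTLE, MXGE_MAX_THROTTLE] are rejected
 * with EINVAL.
 */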
1283 
1284 static int
1285 mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
1286 {
1287 	mxge_softc_t *sc;
1288 	int err, use_rss;
1289 
1290 	sc = arg1;
1291 	use_rss = sc->use_rss;
1292 	err = sysctl_handle_int(oidp, &use_rss, arg2, req);
1293 	if (err != 0)
1294 		return err;
1295 
1296 	if (use_rss == sc->use_rss)
1297 		return 0;
1298 
1299 	ifnet_serialize_all(sc->ifp);
1300 
1301 	sc->use_rss = use_rss;
1302 	if (sc->ifp->if_flags & IFF_RUNNING) {
1303 		mxge_close(sc, 0);
1304 		mxge_open(sc);
1305 	}
1306 
1307 	ifnet_deserialize_all(sc->ifp);
1308 	return err;
1309 }
1310 
1311 static int
1312 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1313 {
1314 	mxge_softc_t *sc;
1315 	unsigned int intr_coal_delay;
1316 	int err;
1317 
1318 	sc = arg1;
1319 	intr_coal_delay = sc->intr_coal_delay;
1320 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1321 	if (err != 0)
1322 		return err;
1323 
1324 	if (intr_coal_delay == sc->intr_coal_delay)
1325 		return 0;
1326 
1327 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1328 		return EINVAL;
1329 
1330 	ifnet_serialize_all(sc->ifp);
1331 
1332 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1333 	sc->intr_coal_delay = intr_coal_delay;
1334 
1335 	ifnet_deserialize_all(sc->ifp);
1336 	return err;
1337 }
1338 
1339 static int
1340 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1341 {
1342 	mxge_softc_t *sc;
1343 	unsigned int enabled;
1344 	int err;
1345 
1346 	sc = arg1;
1347 	enabled = sc->pause;
1348 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1349 	if (err != 0)
1350 		return err;
1351 
1352 	if (enabled == sc->pause)
1353 		return 0;
1354 
1355 	ifnet_serialize_all(sc->ifp);
1356 	err = mxge_change_pause(sc, enabled);
1357 	ifnet_deserialize_all(sc->ifp);
1358 
1359 	return err;
1360 }
1361 
1362 static int
1363 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1364 {
1365 	int err;
1366 
1367 	if (arg1 == NULL)
1368 		return EFAULT;
1369 	arg2 = be32toh(*(int *)arg1);
1370 	arg1 = NULL;
1371 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1372 
1373 	return err;
1374 }
1375 
1376 static void
1377 mxge_rem_sysctls(mxge_softc_t *sc)
1378 {
1379 	if (sc->ss != NULL) {
1380 		struct mxge_slice_state *ss;
1381 		int slice;
1382 
1383 		for (slice = 0; slice < sc->num_slices; slice++) {
1384 			ss = &sc->ss[slice];
1385 			if (ss->sysctl_tree != NULL) {
1386 				sysctl_ctx_free(&ss->sysctl_ctx);
1387 				ss->sysctl_tree = NULL;
1388 			}
1389 		}
1390 	}
1391 
1392 	if (sc->slice_sysctl_tree != NULL) {
1393 		sysctl_ctx_free(&sc->slice_sysctl_ctx);
1394 		sc->slice_sysctl_tree = NULL;
1395 	}
1396 }
1397 
1398 static void
1399 mxge_add_sysctls(mxge_softc_t *sc)
1400 {
1401 	struct sysctl_ctx_list *ctx;
1402 	struct sysctl_oid_list *children;
1403 	mcp_irq_data_t *fw;
1404 	struct mxge_slice_state *ss;
1405 	int slice;
1406 	char slice_num[8];
1407 
1408 	ctx = device_get_sysctl_ctx(sc->dev);
1409 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1410 	fw = sc->ss[0].fw_stats;
1411 
1412 	/*
1413 	 * Random information
1414 	 */
1415 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1416 	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1417 
1418 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1419 	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1420 
1421 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1422 	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1423 
1424 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1425 	    CTLFLAG_RD, &sc->link_width, 0, "link width");
1426 
1427 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1428 	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1429 
1430 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1431 	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1432 
1433 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1434 	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1435 
1436 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1437 	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1438 
1439 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1440 	    CTLFLAG_RD, &sc->read_write_dma, 0,
1441 	    "DMA concurrent Read/Write speed in MB/s");
1442 
1443 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1444 	    CTLFLAG_RD, &sc->watchdog_resets, 0,
1445 	    "Number of times NIC was reset");
1446 
1447 	/*
1448 	 * Performance related tunables
1449 	 */
1450 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1451 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1452 	    "Interrupt coalescing delay in usecs");
1453 
1454 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1455 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1456 	    "Transmit throttling");
1457 
1458 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1459 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1460 	    "Interrupt coalescing delay in usecs");
1461 
1462 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
1463 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
1464 	    "Use RSS");
1465 
1466 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1467 	    CTLFLAG_RW, &mxge_deassert_wait, 0,
1468 	    "Wait for IRQ line to go low in ihandler");
1469 
1470 	/*
1471 	 * Stats block from firmware is in network byte order.
1472 	 * Need to swap it
1473 	 */
1474 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1475 	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1476 	    mxge_handle_be32, "I", "link up");
1477 
1478 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1479 	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1480 	    mxge_handle_be32, "I", "rdma_tags_available");
1481 
1482 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1483 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1484 	    mxge_handle_be32, "I", "dropped_bad_crc32");
1485 
1486 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1487 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1488 	    mxge_handle_be32, "I", "dropped_bad_phy");
1489 
1490 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1491 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1492 	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1493 
1494 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1495 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1496 	    mxge_handle_be32, "I", "dropped_link_overflow");
1497 
1498 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1499 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1500 	    mxge_handle_be32, "I", "dropped_multicast_filtered");
1501 
1502 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1503 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1504 	    mxge_handle_be32, "I", "dropped_no_big_buffer");
1505 
1506 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1507 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1508 	    mxge_handle_be32, "I", "dropped_no_small_buffer");
1509 
1510 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1511 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1512 	    mxge_handle_be32, "I", "dropped_overrun");
1513 
1514 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1515 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1516 	    mxge_handle_be32, "I", "dropped_pause");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1519 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1520 	    mxge_handle_be32, "I", "dropped_runt");
1521 
1522 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1523 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1524 	    mxge_handle_be32, "I", "dropped_unicast_filtered");
1525 
1526 	/* add counters exported for debugging from all slices */
1527 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1528 	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1529 	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1530 	if (sc->slice_sysctl_tree == NULL) {
1531 		device_printf(sc->dev, "can't add slice sysctl node\n");
1532 		return;
1533 	}
1534 
1535 	for (slice = 0; slice < sc->num_slices; slice++) {
1536 		ss = &sc->ss[slice];
1537 		sysctl_ctx_init(&ss->sysctl_ctx);
1538 		ctx = &ss->sysctl_ctx;
1539 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1540 		ksprintf(slice_num, "%d", slice);
1541 		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1542 		    slice_num, CTLFLAG_RD, 0, "");
1543 		if (ss->sysctl_tree == NULL) {
1544 			device_printf(sc->dev,
1545 			    "can't add %d slice sysctl node\n", slice);
1546 			return;	/* XXX continue? */
1547 		}
1548 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1549 
1550 		/*
1551 		 * XXX change to ULONG
1552 		 */
1553 
1554 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1555 		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");
1556 
1557 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1558 		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_small_cnt");
1559 
1560 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1561 		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1562 
1563 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1564 		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1565 
1566 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1567 		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_done");
1568 
1569 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1570 		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1571 
1572 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1573 		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1574 
1575 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1576 		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1577 	}
1578 }
1579 
1580 /*
1581  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1582  * backwards one at a time and handle ring wraps
1583  */
1584 static __inline void
1585 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1586     mcp_kreq_ether_send_t *src, int cnt)
1587 {
1588 	int idx, starting_slot;
1589 
1590 	starting_slot = tx->req;
1591 	while (cnt > 1) {
1592 		cnt--;
1593 		idx = (starting_slot + cnt) & tx->mask;
1594 		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
1595 		wmb();
1596 	}
1597 }
1598 
1599 /*
1600  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1601  * at most 32 bytes at a time, so as to avoid involving the software
1602  * PIO handler in the NIC.  We re-write the first segment's flags
1603  * to mark them valid only after writing the entire chain.
1604  */
1605 static __inline void
1606 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1607 {
1608 	int idx, i;
1609 	uint32_t *src_ints;
1610 	volatile uint32_t *dst_ints;
1611 	mcp_kreq_ether_send_t *srcp;
1612 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1613 	uint8_t last_flags;
1614 
1615 	idx = tx->req & tx->mask;
1616 
1617 	last_flags = src->flags;
1618 	src->flags = 0;
1619 	wmb();
1620 	dst = dstp = &tx->lanai[idx];
1621 	srcp = src;
1622 
1623 	if ((idx + cnt) < tx->mask) {
1624 		for (i = 0; i < cnt - 1; i += 2) {
1625 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1626 			wmb(); /* force write every 32 bytes */
1627 			srcp += 2;
1628 			dstp += 2;
1629 		}
1630 	} else {
1631 		/*
1632 		 * Submit all but the first request, and ensure
1633 		 * that it is submitted below
1634 		 */
1635 		mxge_submit_req_backwards(tx, src, cnt);
1636 		i = 0;
1637 	}
1638 	if (i < cnt) {
1639 		/* Submit the first request */
1640 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1641 		wmb(); /* barrier before setting valid flag */
1642 	}
1643 
1644 	/* Re-write the last 32-bits with the valid flags */
1645 	src->flags = last_flags;
1646 	src_ints = (uint32_t *)src;
1647 	src_ints+=3;
1648 	dst_ints = (volatile uint32_t *)dst;
1649 	dst_ints+=3;
1650 	*dst_ints = *src_ints;
1651 	tx->req += cnt;
1652 	wmb();
1653 }
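/*
 * Wrap example with hypothetical ring state: with tx->mask = 255,
 * tx->req = 254 and cnt = 4, idx + cnt (258) exceeds the mask, so
 * requests 3, 2 and 1 are copied backwards into slots 1, 0 and 255
 * first; only then is request 0 written to slot 254 and its valid
 * flags restored, so the NIC never parses a half-written chain.
 */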
1654 
1655 static int
1656 mxge_pullup_tso(struct mbuf **mp)
1657 {
1658 	int hoff, iphlen, thoff;
1659 	struct mbuf *m;
1660 
1661 	m = *mp;
1662 	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1663 
1664 	iphlen = m->m_pkthdr.csum_iphlen;
1665 	thoff = m->m_pkthdr.csum_thlen;
1666 	hoff = m->m_pkthdr.csum_lhlen;
1667 
1668 	KASSERT(iphlen > 0, ("invalid ip hlen"));
1669 	KASSERT(thoff > 0, ("invalid tcp hlen"));
1670 	KASSERT(hoff > 0, ("invalid ether hlen"));
1671 
1672 	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1673 		m = m_pullup(m, hoff + iphlen + thoff);
1674 		if (m == NULL) {
1675 			*mp = NULL;
1676 			return ENOBUFS;
1677 		}
1678 		*mp = m;
1679 	}
1680 	return 0;
1681 }
1682 
1683 static int
1684 mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
1685     struct mbuf *m, int busdma_seg_cnt)
1686 {
1687 	mcp_kreq_ether_send_t *req;
1688 	bus_dma_segment_t *seg;
1689 	uint32_t low, high_swapped;
1690 	int len, seglen, cum_len, cum_len_next;
1691 	int next_is_first, chop, cnt, rdma_count, small;
1692 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1693 	uint8_t flags, flags_next;
1694 	struct mxge_buffer_state *info_last;
1695 	bus_dmamap_t map = info_map->map;
1696 
1697 	mss = m->m_pkthdr.tso_segsz;
1698 
1699 	/*
1700 	 * Negative cum_len signifies to the send loop that we are
1701 	 * still in the header portion of the TSO packet.
1702 	 */
1703 	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1704 	    m->m_pkthdr.csum_thlen);
1705 
1706 	/*
1707 	 * TSO implies checksum offload on this hardware
1708 	 */
1709 	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1710 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1711 
1712 	/*
1713 	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
1714 	 * out where to put the checksum by parsing the header.
1715 	 */
1716 	pseudo_hdr_offset = htobe16(mss);
1717 
1718 	req = tx->req_list;
1719 	seg = tx->seg_list;
1720 	cnt = 0;
1721 	rdma_count = 0;
1722 
1723 	/*
1724 	 * "rdma_count" is the number of RDMAs belonging to the current
1725 	 * packet BEFORE the current send request.  For non-TSO packets,
1726 	 * this is equal to "count".
1727 	 *
1728 	 * For TSO packets, rdma_count needs to be reset to 0 after a
1729 	 * segment cut.
1730 	 *
1731 	 * The rdma_count field of the send request is the number of
1732 	 * RDMAs of the packet starting at that request.  For TSO send
1733 	 * requests with one or more cuts in the middle, this is the
1734 	 * number of RDMAs starting after the last cut in the request.
1735 	 * All previous segments before the last cut implicitly have 1
1736 	 * RDMA.
1737 	 *
1738 	 * Since the number of RDMAs is not known beforehand, it must be
1739 	 * filled-in retroactively - after each segmentation cut or at
1740 	 * the end of the entire packet.
1741 	 */
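	/*
	 * Worked example with hypothetical numbers: for a packet with
	 * a 54-byte header (14 Ethernet + 20 IP + 20 TCP) and
	 * mss = 1448, cum_len starts at -54.  Descriptors covering
	 * those first 54 bytes carry MXGEFW_FLAGS_TSO_HDR; once
	 * cum_len_next reaches 0 the header ends and rdma_count
	 * restarts at -1.  From then on, whenever cum_len_next
	 * crosses a multiple of mss the current descriptor is flagged
	 * MXGEFW_FLAGS_TSO_CHOP and the descriptor starting the next
	 * segment is flagged MXGEFW_FLAGS_FIRST again.
	 */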
1742 
1743 	while (busdma_seg_cnt) {
1744 		/*
1745 		 * Break the busdma segment up into pieces
1746 		 */
1747 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1748 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1749 		len = seg->ds_len;
1750 
1751 		while (len) {
1752 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1753 			seglen = len;
1754 			cum_len_next = cum_len + seglen;
1755 			(req - rdma_count)->rdma_count = rdma_count + 1;
1756 			if (__predict_true(cum_len >= 0)) {
1757 				/* Payload */
1758 				chop = (cum_len_next > mss);
1759 				cum_len_next = cum_len_next % mss;
1760 				next_is_first = (cum_len_next == 0);
1761 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1762 				flags_next |=
1763 				    next_is_first * MXGEFW_FLAGS_FIRST;
1764 				rdma_count |= -(chop | next_is_first);
1765 				rdma_count += chop & !next_is_first;
1766 			} else if (cum_len_next >= 0) {
1767 				/* Header ends */
1768 				rdma_count = -1;
1769 				cum_len_next = 0;
1770 				seglen = -cum_len;
1771 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1772 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1773 				    MXGEFW_FLAGS_FIRST |
1774 				    (small * MXGEFW_FLAGS_SMALL);
1775 			}
1776 
1777 			req->addr_high = high_swapped;
1778 			req->addr_low = htobe32(low);
1779 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1780 			req->pad = 0;
1781 			req->rdma_count = 1;
1782 			req->length = htobe16(seglen);
1783 			req->cksum_offset = cksum_offset;
1784 			req->flags =
1785 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1786 			low += seglen;
1787 			len -= seglen;
1788 			cum_len = cum_len_next;
1789 			flags = flags_next;
1790 			req++;
1791 			cnt++;
1792 			rdma_count++;
1793 			if (__predict_false(cksum_offset > seglen))
1794 				cksum_offset -= seglen;
1795 			else
1796 				cksum_offset = 0;
1797 			if (__predict_false(cnt > tx->max_desc))
1798 				goto drop;
1799 		}
1800 		busdma_seg_cnt--;
1801 		seg++;
1802 	}
1803 	(req - rdma_count)->rdma_count = rdma_count;
1804 
1805 	do {
1806 		req--;
1807 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1808 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1809 
1810 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1811 
1812 	info_map->map = info_last->map;
1813 	info_last->map = map;
1814 	info_last->m = m;
1815 
1816 	mxge_submit_req(tx, tx->req_list, cnt);
1817 
1818 	if (tx->send_go != NULL && tx->queue_active == 0) {
1819 		/* Tell the NIC to start polling this slice */
1820 		*tx->send_go = 1;
1821 		tx->queue_active = 1;
1822 		tx->activate++;
1823 		wmb();
1824 	}
1825 	return 0;
1826 
1827 drop:
1828 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1829 	m_freem(m);
1830 	return ENOBUFS;
1831 }
1832 
1833 static int
1834 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1835 {
1836 	mcp_kreq_ether_send_t *req;
1837 	bus_dma_segment_t *seg;
1838 	bus_dmamap_t map;
1839 	int cnt, cum_len, err, i, idx, odd_flag;
1840 	uint16_t pseudo_hdr_offset;
1841 	uint8_t flags, cksum_offset;
1842 	struct mxge_buffer_state *info_map, *info_last;
1843 
1844 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1845 		err = mxge_pullup_tso(&m);
1846 		if (__predict_false(err))
1847 			return err;
1848 	}
1849 
1850 	/*
1851 	 * Map the frame for DMA
1852 	 */
1853 	idx = tx->req & tx->mask;
1854 	info_map = &tx->info[idx];
1855 	map = info_map->map;
1856 
1857 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1858 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1859 	if (__predict_false(err != 0))
1860 		goto drop;
1861 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1862 
1863 	/*
1864 	 * TSO is different enough, we handle it in another routine
1865 	 */
1866 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1867 		return mxge_encap_tso(tx, info_map, m, cnt);
1868 
1869 	req = tx->req_list;
1870 	cksum_offset = 0;
1871 	pseudo_hdr_offset = 0;
1872 	flags = MXGEFW_FLAGS_NO_TSO;
1873 
1874 	/*
1875 	 * Checksum offloading
1876 	 */
1877 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1878 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1879 	pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1880 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1881 		req->cksum_offset = cksum_offset;
1882 		flags |= MXGEFW_FLAGS_CKSUM;
1883 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1884 	} else {
1885 		odd_flag = 0;
1886 	}
1887 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1888 		flags |= MXGEFW_FLAGS_SMALL;
1889 
1890 	/*
1891 	 * Convert segments into a request list
1892 	 */
1893 	cum_len = 0;
1894 	seg = tx->seg_list;
1895 	req->flags = MXGEFW_FLAGS_FIRST;
1896 	for (i = 0; i < cnt; i++) {
1897 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1898 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1899 		req->length = htobe16(seg->ds_len);
1900 		req->cksum_offset = cksum_offset;
1901 		if (cksum_offset > seg->ds_len)
1902 			cksum_offset -= seg->ds_len;
1903 		else
1904 			cksum_offset = 0;
1905 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1906 		req->pad = 0; /* complete solid 16-byte block */
1907 		req->rdma_count = 1;
1908 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1909 		cum_len += seg->ds_len;
1910 		seg++;
1911 		req++;
1912 		req->flags = 0;
1913 	}
1914 	req--;
1915 
1916 	/*
1917 	 * Pad runt to 60 bytes
1918 	 */
1919 	if (cum_len < 60) {
1920 		req++;
1921 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1922 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1923 		req->length = htobe16(60 - cum_len);
1924 		req->cksum_offset = 0;
1925 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1926 		req->pad = 0; /* complete solid 16-byte block */
1927 		req->rdma_count = 1;
1928 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1929 		cnt++;
1930 	}
1931 
1932 	tx->req_list[0].rdma_count = cnt;
1933 #if 0
1934 	/* print what the firmware will see */
1935 	for (i = 0; i < cnt; i++) {
1936 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1937 		    "cso:%d, flags:0x%x, rdma:%d\n",
1938 		    i, (int)ntohl(tx->req_list[i].addr_high),
1939 		    (int)ntohl(tx->req_list[i].addr_low),
1940 		    (int)ntohs(tx->req_list[i].length),
1941 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1942 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1943 		    tx->req_list[i].rdma_count);
1944 	}
1945 	kprintf("--------------\n");
1946 #endif
1947 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1948 
1949 	info_map->map = info_last->map;
1950 	info_last->map = map;
1951 	info_last->m = m;
1952 
1953 	mxge_submit_req(tx, tx->req_list, cnt);
1954 
1955 	if (tx->send_go != NULL && tx->queue_active == 0) {
1956 		/* Tell the NIC to start polling this slice */
1957 		*tx->send_go = 1;
1958 		tx->queue_active = 1;
1959 		tx->activate++;
1960 		wmb();
1961 	}
1962 	return 0;
1963 
1964 drop:
1965 	m_freem(m);
1966 	return err;
1967 }
1968 
1969 static void
1970 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1971 {
1972 	mxge_softc_t *sc = ifp->if_softc;
1973 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1974 	bus_addr_t zeropad;
1975 	int encap = 0;
1976 
1977 	KKASSERT(tx->ifsq == ifsq);
1978 	ASSERT_SERIALIZED(&tx->tx_serialize);
1979 
1980 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1981 		return;
1982 
1983 	zeropad = sc->zeropad_dma.dmem_busaddr;
1984 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1985 		struct mbuf *m;
1986 		int error;
1987 
1988 		m = ifsq_dequeue(ifsq);
1989 		if (m == NULL)
1990 			goto done;
1991 
1992 		BPF_MTAP(ifp, m);
1993 		error = mxge_encap(tx, m, zeropad);
1994 		if (!error)
1995 			encap = 1;
1996 		else
1997 			IFNET_STAT_INC(ifp, oerrors, 1);
1998 	}
1999 
2000 	/* Ran out of transmit slots */
2001 	ifsq_set_oactive(ifsq);
2002 done:
2003 	if (encap)
2004 		tx->watchdog.wd_timer = 5;
2005 }
2006 
2007 static void
2008 mxge_watchdog(struct ifaltq_subque *ifsq)
2009 {
2010 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
2011 	struct mxge_softc *sc = ifp->if_softc;
2012 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
2013 	mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
2014 
2015 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2016 
2017 	/* Check for pause blocking before resetting */
2018 	if (tx->watchdog_rx_pause == rx_pause) {
2019 		mxge_warn_stuck(sc, tx, 0);
2020 		mxge_watchdog_reset(sc);
2021 		return;
2022 	} else {
2023 		if_printf(ifp, "Flow control blocking xmits, "
2024 		    "check link partner\n");
2025 	}
2026 	tx->watchdog_rx_pause = rx_pause;
2027 }
2028 
2029 /*
2030  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2031  * at most 32 bytes at a time, so as to avoid involving the software
2032  * PIO handler in the NIC.  We re-write the first segment's low
2033  * DMA address to mark it valid only after we write the entire chunk
2034  * in a burst.
2035  */
2036 static __inline void
2037 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2038     mcp_kreq_ether_recv_t *src)
2039 {
2040 	uint32_t low;
2041 
2042 	low = src->addr_low;
2043 	src->addr_low = 0xffffffff;
2044 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2045 	wmb();
2046 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2047 	wmb();
2048 	src->addr_low = low;
2049 	dst->addr_low = low;
2050 	wmb();
2051 }
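#if 0
/*
 * Illustrative sketch (not driver code) of the publish pattern used
 * above, reduced to plain stores: clobber the word the consumer polls,
 * burst the body across, then make the entry visible with one final
 * write.  The entry layout and helper name are assumptions for the
 * example.
 */
struct entry {
	volatile uint32_t addr_low;	/* 0xffffffff means "not ready" */
	uint32_t addr_high;
};

static __inline void
publish_entry(volatile struct entry *dst, struct entry *src)
{
	uint32_t low = src->addr_low;

	src->addr_low = 0xffffffff;	/* mark invalid during the burst */
	*dst = *src;			/* body may land in any order */
	wmb();				/* order body before the final store */
	dst->addr_low = low;		/* a single store flips it live */
	src->addr_low = low;		/* restore the shadow copy */
}
#endif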
2052 
2053 static int
2054 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2055     boolean_t init)
2056 {
2057 	bus_dma_segment_t seg;
2058 	struct mbuf *m;
2059 	int cnt, err, mflag;
2060 
2061 	mflag = M_NOWAIT;
2062 	if (__predict_false(init))
2063 		mflag = M_WAITOK;
2064 
2065 	m = m_gethdr(mflag, MT_DATA);
2066 	if (m == NULL) {
2067 		err = ENOBUFS;
2068 		if (__predict_false(init)) {
2069 			/*
2070 			 * During initialization, there
2071 			 * is nothing to setup; bail out
2072 			 */
2073 			return err;
2074 		}
2075 		goto done;
2076 	}
2077 	m->m_len = m->m_pkthdr.len = MHLEN;
2078 
2079 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2080 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2081 	if (err != 0) {
2082 		m_freem(m);
2083 		if (__predict_false(init)) {
2084 			/*
2085 			 * During initialization, there
2086 			 * is nothing to setup; bail out
2087 			 */
2088 			return err;
2089 		}
2090 		goto done;
2091 	}
2092 
2093 	rx->info[idx].m = m;
2094 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2095 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2096 
2097 done:
2098 	if ((idx & 7) == 7)
2099 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2100 	return err;
2101 }
2102 
2103 static int
2104 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2105     boolean_t init)
2106 {
2107 	bus_dma_segment_t seg;
2108 	struct mbuf *m;
2109 	int cnt, err, mflag;
2110 
2111 	mflag = M_NOWAIT;
2112 	if (__predict_false(init))
2113 		mflag = M_WAITOK;
2114 
2115 	if (rx->cl_size == MCLBYTES)
2116 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2117 	else
2118 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2119 	if (m == NULL) {
2120 		err = ENOBUFS;
2121 		if (__predict_false(init)) {
2122 			/*
2123 			 * During initialization, there
2124 			 * is nothing to setup; bail out
2125 			 */
2126 			return err;
2127 		}
2128 		goto done;
2129 	}
2130 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2131 
2132 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2133 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2134 	if (err != 0) {
2135 		m_freem(m);
2136 		if (__predict_false(init)) {
2137 			/*
2138 			 * During initialization, there
2139 			 * is nothing to setup; bail out
2140 			 */
2141 			return err;
2142 		}
2143 		goto done;
2144 	}
2145 
2146 	rx->info[idx].m = m;
2147 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2148 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2149 
2150 done:
2151 	if ((idx & 7) == 7)
2152 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2153 	return err;
2154 }
2155 
2156 /*
2157  * Myri10GE hardware checksums are not valid if the sender
2158  * padded the frame with non-zero padding.  This is because
2159  * the firmware just does a simple 16-bit 1s complement
2160  * checksum across the entire frame, excluding the first 14
2161  * bytes.  It is best to simply check the checksum and
2162  * tell the stack about it only if the checksum is good.
2163  */
2164 static __inline uint16_t
2165 mxge_rx_csum(struct mbuf *m, int csum)
2166 {
2167 	const struct ether_header *eh;
2168 	const struct ip *ip;
2169 	uint16_t c;
2170 
2171 	eh = mtod(m, const struct ether_header *);
2172 
2173 	/* Only deal with IPv4 TCP & UDP for now */
2174 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2175 		return 1;
2176 
2177 	ip = (const struct ip *)(eh + 1);
2178 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2179 		return 1;
2180 
2181 #ifdef INET
2182 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2183 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2184 	          (ip->ip_hl << 2) + ip->ip_p));
2185 #else
2186 	c = 1;
2187 #endif
2188 	c ^= 0xffff;
2189 	return c;
2190 }
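#if 0
/*
 * Illustrative sketch (not driver code): folding a 32-bit partial
 * one's-complement sum down to the final 16-bit checksum, the same
 * arithmetic the pseudo-header verification above relies on.  The
 * helper name is an assumption for the example.
 */
static __inline uint16_t
csum_fold(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);	/* fold the carries once */
	sum = (sum >> 16) + (sum & 0xffff);	/* and any carry from that */
	return (uint16_t)~sum;			/* 0 means the checksum verified */
}
#endif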
2191 
2192 static void
2193 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2194 {
2195 	struct ether_vlan_header *evl;
2196 	uint32_t partial;
2197 
2198 	evl = mtod(m, struct ether_vlan_header *);
2199 
2200 	/*
2201 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2202 	 * what the firmware thought was the end of the ethernet
2203 	 * header.
2204 	 */
2205 
2206 	/* Put checksum into host byte order */
2207 	*csum = ntohs(*csum);
2208 
2209 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2210 	*csum += ~partial;
2211 	*csum += ((*csum) < ~partial);
2212 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2213 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2214 
2215 	/*
2216 	 * Restore checksum to network byte order;
2217 	 * later consumers expect this
2218 	 */
2219 	*csum = htons(*csum);
2220 
2221 	/* save the tag */
2222 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2223 	m->m_flags |= M_VLANTAG;
2224 
2225 	/*
2226 	 * Remove the 802.1q header by copying the Ethernet
2227 	 * addresses over it and adjusting the beginning of
2228 	 * the data in the mbuf.  The encapsulated Ethernet
2229 	 * type field is already in place.
2230 	 */
2231 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2232 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2233 	m_adj(m, EVL_ENCAPLEN);
2234 }
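#if 0
/*
 * Illustrative sketch (not driver code): the update performed above as
 * a standalone helper.  Subtracting a word from a one's-complement sum
 * is adding its complement with end-around carry, then folding back to
 * 16 bits.  The helper name is an assumption for the example.
 */
static __inline uint32_t
csum_sub32(uint32_t csum, uint32_t word)
{
	csum += ~word;				/* add the complement ... */
	csum += (csum < ~word);			/* ... with end-around carry */
	csum = (csum >> 16) + (csum & 0xffff);	/* fold to 16 bits */
	csum = (csum >> 16) + (csum & 0xffff);
	return csum;
}
#endif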
2235 
2237 static __inline void
2238 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2239     uint32_t len, uint32_t csum)
2240 {
2241 	struct mbuf *m;
2242 	const struct ether_header *eh;
2243 	bus_dmamap_t old_map;
2244 	int idx;
2245 
2246 	idx = rx->cnt & rx->mask;
2247 	rx->cnt++;
2248 
2249 	/* Save a pointer to the received mbuf */
2250 	m = rx->info[idx].m;
2251 
2252 	/* Try to replace the received mbuf */
2253 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2254 		/* Drop the frame -- the old mbuf is re-cycled */
2255 		IFNET_STAT_INC(ifp, ierrors, 1);
2256 		return;
2257 	}
2258 
2259 	/* Unmap the received buffer */
2260 	old_map = rx->info[idx].map;
2261 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2262 	bus_dmamap_unload(rx->dmat, old_map);
2263 
2264 	/* Swap the bus_dmamap_t's */
2265 	rx->info[idx].map = rx->extra_map;
2266 	rx->extra_map = old_map;
2267 
2268 	/*
2269 	 * The MCP implicitly skips the first 2 bytes so that the packet
2270 	 * is properly aligned.
2271 	 */
2272 	m->m_data += MXGEFW_PAD;
2273 
2274 	m->m_pkthdr.rcvif = ifp;
2275 	m->m_len = m->m_pkthdr.len = len;
2276 
2277 	IFNET_STAT_INC(ifp, ipackets, 1);
2278 
2279 	eh = mtod(m, const struct ether_header *);
2280 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2281 		mxge_vlan_tag_remove(m, &csum);
2282 
2283 	/* If the checksum is valid, mark it in the mbuf header */
2284 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2285 	    mxge_rx_csum(m, csum) == 0) {
2286 		/* Tell the stack that the checksum is good */
2287 		m->m_pkthdr.csum_data = 0xffff;
2288 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2289 		    CSUM_DATA_VALID;
2290 	}
2291 	ifp->if_input(ifp, m, NULL, -1);
2292 }
2293 
2294 static __inline void
2295 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2296     uint32_t len, uint32_t csum)
2297 {
2298 	const struct ether_header *eh;
2299 	struct mbuf *m;
2300 	bus_dmamap_t old_map;
2301 	int idx;
2302 
2303 	idx = rx->cnt & rx->mask;
2304 	rx->cnt++;
2305 
2306 	/* Save a pointer to the received mbuf */
2307 	m = rx->info[idx].m;
2308 
2309 	/* Try to replace the received mbuf */
2310 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2311 		/* Drop the frame -- the old mbuf is re-cycled */
2312 		IFNET_STAT_INC(ifp, ierrors, 1);
2313 		return;
2314 	}
2315 
2316 	/* Unmap the received buffer */
2317 	old_map = rx->info[idx].map;
2318 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2319 	bus_dmamap_unload(rx->dmat, old_map);
2320 
2321 	/* Swap the bus_dmamap_t's */
2322 	rx->info[idx].map = rx->extra_map;
2323 	rx->extra_map = old_map;
2324 
2325 	/*
2326 	 * The MCP implicitly skips the first 2 bytes so that the packet
2327 	 * is properly aligned.
2328 	 */
2329 	m->m_data += MXGEFW_PAD;
2330 
2331 	m->m_pkthdr.rcvif = ifp;
2332 	m->m_len = m->m_pkthdr.len = len;
2333 
2334 	IFNET_STAT_INC(ifp, ipackets, 1);
2335 
2336 	eh = mtod(m, const struct ether_header *);
2337 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2338 		mxge_vlan_tag_remove(m, &csum);
2339 
2340 	/* If the checksum is valid, mark it in the mbuf header */
2341 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2342 	    mxge_rx_csum(m, csum) == 0) {
2343 		/* Tell the stack that the checksum is good */
2344 		m->m_pkthdr.csum_data = 0xffff;
2345 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2346 		    CSUM_DATA_VALID;
2347 	}
2348 	ifp->if_input(ifp, m, NULL, -1);
2349 }
2350 
2351 static __inline void
2352 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2353 {
2354 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2355 
2356 	while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2357 		uint16_t length, checksum;
2358 
2359 		length = ntohs(rx_done->entry[rx_done->idx].length);
2360 		rx_done->entry[rx_done->idx].length = 0;
2361 
2362 		checksum = rx_done->entry[rx_done->idx].checksum;
2363 
2364 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2365 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2366 			    length, checksum);
2367 		} else {
2368 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2369 			    length, checksum);
2370 		}
2371 
2372 		rx_done->idx++;
2373 		rx_done->idx &= rx_done->mask;
2374 		--cycle;
2375 	}
2376 }
2377 
2378 static __inline void
2379 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2380 {
2381 	ASSERT_SERIALIZED(&tx->tx_serialize);
2382 
2383 	while (tx->pkt_done != mcp_idx) {
2384 		struct mbuf *m;
2385 		int idx;
2386 
2387 		idx = tx->done & tx->mask;
2388 		tx->done++;
2389 
2390 		m = tx->info[idx].m;
2391 		/*
2392 		 * mbuf and DMA map only attached to the first
2393 		 * segment per-mbuf.
2394 		 */
2395 		if (m != NULL) {
2396 			tx->pkt_done++;
2397 			IFNET_STAT_INC(ifp, opackets, 1);
2398 			tx->info[idx].m = NULL;
2399 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2400 			m_freem(m);
2401 		}
2402 	}
2403 
2404 	/*
2405 	 * If we have space, clear OACTIVE to tell the stack that
2406 	 * it's OK to send packets
2407 	 */
2408 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2409 		ifsq_clr_oactive(tx->ifsq);
2410 		if (tx->req == tx->done) {
2411 			/* Reset watchdog */
2412 			tx->watchdog.wd_timer = 0;
2413 		}
2414 	}
2415 
2416 	if (!ifsq_is_empty(tx->ifsq))
2417 		ifsq_devstart(tx->ifsq);
2418 
2419 	if (tx->send_stop != NULL && tx->req == tx->done) {
2420 		/*
2421 		 * Let the NIC stop polling this queue, since there
2422 		 * are no more transmits pending
2423 		 */
2424 		*tx->send_stop = 1;
2425 		tx->queue_active = 0;
2426 		tx->deactivate++;
2427 		wmb();
2428 	}
2429 }
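#if 0
/*
 * Illustrative sketch (not driver code): the free-running index scheme
 * used by tx->req and tx->done above.  Indices only increment; a slot
 * is addressed as "idx & mask" and occupancy is the plain difference,
 * which remains correct across integer wraparound as long as the ring
 * size is a power of two.  Names are assumptions for the example.
 */
struct idx_ring {
	u_int req;	/* total descriptors ever queued (producer) */
	u_int done;	/* total descriptors ever retired (consumer) */
	u_int mask;	/* ring size - 1; size is a power of two */
};

static __inline u_int
idx_ring_avail(const struct idx_ring *r)
{
	return r->mask - (r->req - r->done);	/* free slots, one reserved */
}
#endif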
2430 
2431 static struct mxge_media_type mxge_xfp_media_types[] = {
2432 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2433 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2434 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2435 	{0,		(1 << 5),	"10GBASE-ER"},
2436 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2437 	{0,		(1 << 3),	"10GBASE-SW"},
2438 	{0,		(1 << 2),	"10GBASE-LW"},
2439 	{0,		(1 << 1),	"10GBASE-EW"},
2440 	{0,		(1 << 0),	"Reserved"}
2441 };
2442 
2443 static struct mxge_media_type mxge_sfp_media_types[] = {
2444 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2445 	{0,		(1 << 7),	"Reserved"},
2446 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2447 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2448 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2449 	{IFM_10G_TWINAX, (1 << 0),	"10GBASE-Twinax"}
2450 };
2451 
2452 static void
2453 mxge_media_set(mxge_softc_t *sc, int media_type)
2454 {
2455 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2456 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2457 	sc->current_media = media_type;
2458 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2459 }
2460 
2461 static void
2462 mxge_media_init(mxge_softc_t *sc)
2463 {
2464 	const char *ptr;
2465 	int i;
2466 
2467 	ifmedia_removeall(&sc->media);
2468 	mxge_media_set(sc, IFM_AUTO);
2469 
2470 	/*
2471 	 * Parse the product code to determine the interface type
2472 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2473 	 * after the 3rd dash in the driver's cached copy of the
2474 	 * EEPROM's product code string.
2475 	 */
2476 	ptr = sc->product_code_string;
2477 	if (ptr == NULL) {
2478 		if_printf(sc->ifp, "Missing product code\n");
2479 		return;
2480 	}
2481 
2482 	for (i = 0; i < 3; i++, ptr++) {
2483 		ptr = strchr(ptr, '-');
2484 		if (ptr == NULL) {
2485 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2486 			return;
2487 		}
2488 	}
2489 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2490 		/* -C is CX4 */
2491 		sc->connector = MXGE_CX4;
2492 		mxge_media_set(sc, IFM_10G_CX4);
2493 	} else if (*ptr == 'Q') {
2494 		/* -Q is Quad Ribbon Fiber */
2495 		sc->connector = MXGE_QRF;
2496 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2497 		/* DragonFly has no media type for Quad ribbon fiber */
2498 	} else if (*ptr == 'R') {
2499 		/* -R is XFP */
2500 		sc->connector = MXGE_XFP;
2501 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2502 		/* -S or -2S is SFP+ */
2503 		sc->connector = MXGE_SFP;
2504 	} else {
2505 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2506 	}
2507 }
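/*
 * Example, using a made-up product code: for "10G-PCIE-8B-S" the loop
 * above leaves ptr just past the third dash, pointing at 'S', so the
 * connector is classified as SFP+ (MXGE_SFP).  A "...-2S" suffix is
 * caught by the *(ptr + 1) check instead.
 */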
2508 
2509 /*
2510  * Determine the media type for a NIC.  Some XFPs will identify
2511  * themselves only when their link is up, so this is initiated via a
2512  * link up interrupt.  However, this can potentially take up to
2513  * several milliseconds, so it is run via the watchdog routine, rather
2514  * than in the interrupt handler itself.
2515  */
2516 static void
2517 mxge_media_probe(mxge_softc_t *sc)
2518 {
2519 	mxge_cmd_t cmd;
2520 	const char *cage_type;
2521 	struct mxge_media_type *mxge_media_types = NULL;
2522 	int i, err, ms, mxge_media_type_entries;
2523 	uint32_t byte;
2524 
2525 	sc->need_media_probe = 0;
2526 
2527 	if (sc->connector == MXGE_XFP) {
2528 		/* -R is XFP */
2529 		mxge_media_types = mxge_xfp_media_types;
2530 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2531 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2532 		cage_type = "XFP";
2533 	} else if (sc->connector == MXGE_SFP) {
2534 		/* -S or -2S is SFP+ */
2535 		mxge_media_types = mxge_sfp_media_types;
2536 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2537 		cage_type = "SFP+";
2538 		byte = 3;
2539 	} else {
2540 		/* nothing to do; media type cannot change */
2541 		return;
2542 	}
2543 
2544 	/*
2545 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2546 	 * we try to determine what is in the cage by using the
2547 	 * firmware's I2C commands to read the 10GbE compliance
2548 	 * register.  We read just one byte, which may take over
2549 	 * a millisecond.
2550 	 */
2551 
2552 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2553 	cmd.data1 = byte;
2554 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2555 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2556 		if_printf(sc->ifp, "failed to read XFP\n");
2557 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2558 		if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2559 	if (err != MXGEFW_CMD_OK)
2560 		return;
2561 
2562 	/* Now we wait for the data to be cached */
2563 	cmd.data0 = byte;
2564 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2565 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2566 		DELAY(1000);
2567 		cmd.data0 = byte;
2568 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2569 	}
2570 	if (err != MXGEFW_CMD_OK) {
2571 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2572 		    cage_type, err, ms);
2573 		return;
2574 	}
2575 
2576 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2577 		if (bootverbose) {
2578 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2579 			    mxge_media_types[0].name);
2580 		}
2581 		if (sc->current_media != mxge_media_types[0].flag) {
2582 			mxge_media_init(sc);
2583 			mxge_media_set(sc, mxge_media_types[0].flag);
2584 		}
2585 		return;
2586 	}
2587 	for (i = 1; i < mxge_media_type_entries; i++) {
2588 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2589 			if (bootverbose) {
2590 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2591 				    mxge_media_types[i].name);
2592 			}
2593 
2594 			if (sc->current_media != mxge_media_types[i].flag) {
2595 				mxge_media_init(sc);
2596 				mxge_media_set(sc, mxge_media_types[i].flag);
2597 			}
2598 			return;
2599 		}
2600 	}
2601 	if (bootverbose) {
2602 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2603 		    cmd.data0);
2604 	}
2605 }
2606 
2607 static void
2608 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2609 {
2610 	if (sc->link_state != stats->link_up) {
2611 		sc->link_state = stats->link_up;
2612 		if (sc->link_state) {
2613 			sc->ifp->if_link_state = LINK_STATE_UP;
2614 			if_link_state_change(sc->ifp);
2615 			if (bootverbose)
2616 				if_printf(sc->ifp, "link up\n");
2617 		} else {
2618 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2619 			if_link_state_change(sc->ifp);
2620 			if (bootverbose)
2621 				if_printf(sc->ifp, "link down\n");
2622 		}
2623 		sc->need_media_probe = 1;
2624 	}
2625 
2626 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2627 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2628 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2629 		    sc->rdma_tags_available);
2630 	}
2631 
2632 	if (stats->link_down) {
2633 		sc->down_cnt += stats->link_down;
2634 		sc->link_state = 0;
2635 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2636 		if_link_state_change(sc->ifp);
2637 	}
2638 }
2639 
2640 static void
2641 mxge_serialize_skipmain(struct mxge_softc *sc)
2642 {
2643 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2644 }
2645 
2646 static void
2647 mxge_deserialize_skipmain(struct mxge_softc *sc)
2648 {
2649 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2650 }
2651 
2652 static void
2653 mxge_legacy(void *arg)
2654 {
2655 	struct mxge_slice_state *ss = arg;
2656 	mxge_softc_t *sc = ss->sc;
2657 	mcp_irq_data_t *stats = ss->fw_stats;
2658 	mxge_tx_ring_t *tx = &ss->tx;
2659 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2660 	uint32_t send_done_count;
2661 	uint8_t valid;
2662 
2663 	ASSERT_SERIALIZED(&sc->main_serialize);
2664 
2665 	/* Make sure the DMA has finished */
2666 	if (!stats->valid)
2667 		return;
2668 	valid = stats->valid;
2669 
2670 	/* Lower legacy IRQ */
2671 	*sc->irq_deassert = 0;
2672 	if (!mxge_deassert_wait) {
2673 		/* Don't wait for conf. that irq is low */
2674 		stats->valid = 0;
2675 	}
2676 
2677 	mxge_serialize_skipmain(sc);
2678 
2679 	/*
2680 	 * Loop while waiting for legacy irq deassertion
2681 	 * XXX do we really want to loop?
2682 	 */
2683 	do {
2684 		/* Check for transmit completes and receives */
2685 		send_done_count = be32toh(stats->send_done_count);
2686 		while ((send_done_count != tx->pkt_done) ||
2687 		       (rx_done->entry[rx_done->idx].length != 0)) {
2688 			if (send_done_count != tx->pkt_done) {
2689 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2690 				    (int)send_done_count);
2691 			}
2692 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2693 			send_done_count = be32toh(stats->send_done_count);
2694 		}
2695 		if (mxge_deassert_wait)
2696 			wmb();
2697 	} while (*((volatile uint8_t *)&stats->valid));
2698 
2699 	mxge_deserialize_skipmain(sc);
2700 
2701 	/* Fw link & error stats meaningful only on the first slice */
2702 	if (__predict_false(stats->stats_updated))
2703 		mxge_intr_status(sc, stats);
2704 
2705 	/* Check to see if we have rx token to pass back */
2706 	if (valid & 0x1)
2707 		*ss->irq_claim = be32toh(3);
2708 	*(ss->irq_claim + 1) = be32toh(3);
2709 }
2710 
2711 static void
2712 mxge_msi(void *arg)
2713 {
2714 	struct mxge_slice_state *ss = arg;
2715 	mxge_softc_t *sc = ss->sc;
2716 	mcp_irq_data_t *stats = ss->fw_stats;
2717 	mxge_tx_ring_t *tx = &ss->tx;
2718 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2719 	uint32_t send_done_count;
2720 	uint8_t valid;
2721 #ifndef IFPOLL_ENABLE
2722 	const boolean_t polling = FALSE;
2723 #else
2724 	boolean_t polling = FALSE;
2725 #endif
2726 
2727 	ASSERT_SERIALIZED(&sc->main_serialize);
2728 
2729 	/* Make sure the DMA has finished */
2730 	if (__predict_false(!stats->valid))
2731 		return;
2732 
2733 	valid = stats->valid;
2734 	stats->valid = 0;
2735 
2736 #ifdef IFPOLL_ENABLE
2737 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2738 		polling = TRUE;
2739 #endif
2740 
2741 	if (!polling) {
2742 		/* Check for receives */
2743 		lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2744 		if (rx_done->entry[rx_done->idx].length != 0)
2745 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2746 		lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2747 	}
2748 
2749 	/*
2750 	 * Check for transmit completes
2751 	 *
2752 	 * NOTE:
2753 	 * Since pkt_done is only changed by mxge_tx_done(),
2754 	 * which is called only in interrupt handler, the
2755 	 * check w/o holding tx serializer is MPSAFE.
2756 	 */
2757 	send_done_count = be32toh(stats->send_done_count);
2758 	if (send_done_count != tx->pkt_done) {
2759 		lwkt_serialize_enter(&tx->tx_serialize);
2760 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2761 		lwkt_serialize_exit(&tx->tx_serialize);
2762 	}
2763 
2764 	if (__predict_false(stats->stats_updated))
2765 		mxge_intr_status(sc, stats);
2766 
2767 	/* Check to see if we have rx token to pass back */
2768 	if (!polling && (valid & 0x1))
2769 		*ss->irq_claim = be32toh(3);
2770 	*(ss->irq_claim + 1) = be32toh(3);
2771 }
2772 
2773 static void
2774 mxge_msix_rx(void *arg)
2775 {
2776 	struct mxge_slice_state *ss = arg;
2777 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2778 
2779 #ifdef IFPOLL_ENABLE
2780 	if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2781 		return;
2782 #endif
2783 
2784 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2785 
2786 	if (rx_done->entry[rx_done->idx].length != 0)
2787 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2788 
2789 	*ss->irq_claim = be32toh(3);
2790 }
2791 
2792 static void
2793 mxge_msix_rxtx(void *arg)
2794 {
2795 	struct mxge_slice_state *ss = arg;
2796 	mxge_softc_t *sc = ss->sc;
2797 	mcp_irq_data_t *stats = ss->fw_stats;
2798 	mxge_tx_ring_t *tx = &ss->tx;
2799 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2800 	uint32_t send_done_count;
2801 	uint8_t valid;
2802 #ifndef IFPOLL_ENABLE
2803 	const boolean_t polling = FALSE;
2804 #else
2805 	boolean_t polling = FALSE;
2806 #endif
2807 
2808 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2809 
2810 	/* Make sure the DMA has finished */
2811 	if (__predict_false(!stats->valid))
2812 		return;
2813 
2814 	valid = stats->valid;
2815 	stats->valid = 0;
2816 
2817 #ifdef IFPOLL_ENABLE
2818 	if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2819 		polling = TRUE;
2820 #endif
2821 
2822 	/* Check for receives */
2823 	if (!polling && rx_done->entry[rx_done->idx].length != 0)
2824 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2825 
2826 	/*
2827 	 * Check for transmit completes
2828 	 *
2829 	 * NOTE:
2830 	 * Since pkt_done is only changed by mxge_tx_done(),
2831 	 * which is called only in interrupt handler, the
2832 	 * check w/o holding tx serializer is MPSAFE.
2833 	 */
2834 	send_done_count = be32toh(stats->send_done_count);
2835 	if (send_done_count != tx->pkt_done) {
2836 		lwkt_serialize_enter(&tx->tx_serialize);
2837 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2838 		lwkt_serialize_exit(&tx->tx_serialize);
2839 	}
2840 
2841 	/* Check to see if we have rx token to pass back */
2842 	if (!polling && (valid & 0x1))
2843 		*ss->irq_claim = be32toh(3);
2844 	*(ss->irq_claim + 1) = be32toh(3);
2845 }
2846 
2847 static void
2848 mxge_init(void *arg)
2849 {
2850 	struct mxge_softc *sc = arg;
2851 
2852 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2853 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2854 		mxge_open(sc);
2855 }
2856 
2857 static void
2858 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2859 {
2860 	int i;
2861 
2862 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2863 		if (ss->rx_data.rx_big.info[i].m == NULL)
2864 			continue;
2865 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2866 		    ss->rx_data.rx_big.info[i].map);
2867 		m_freem(ss->rx_data.rx_big.info[i].m);
2868 		ss->rx_data.rx_big.info[i].m = NULL;
2869 	}
2870 
2871 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2872 		if (ss->rx_data.rx_small.info[i].m == NULL)
2873 			continue;
2874 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2875 		    ss->rx_data.rx_small.info[i].map);
2876 		m_freem(ss->rx_data.rx_small.info[i].m);
2877 		ss->rx_data.rx_small.info[i].m = NULL;
2878 	}
2879 
2880 	/* Transmit ring used only on the first slice */
2881 	if (ss->tx.info == NULL)
2882 		return;
2883 
2884 	for (i = 0; i <= ss->tx.mask; i++) {
2885 		if (ss->tx.info[i].m == NULL)
2886 			continue;
2887 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2888 		m_freem(ss->tx.info[i].m);
2889 		ss->tx.info[i].m = NULL;
2890 	}
2891 }
2892 
2893 static void
2894 mxge_free_mbufs(mxge_softc_t *sc)
2895 {
2896 	int slice;
2897 
2898 	for (slice = 0; slice < sc->num_slices; slice++)
2899 		mxge_free_slice_mbufs(&sc->ss[slice]);
2900 }
2901 
2902 static void
2903 mxge_free_slice_rings(struct mxge_slice_state *ss)
2904 {
2905 	int i;
2906 
2907 	if (ss->rx_data.rx_done.entry != NULL) {
2908 		mxge_dma_free(&ss->rx_done_dma);
2909 		ss->rx_data.rx_done.entry = NULL;
2910 	}
2911 
2912 	if (ss->tx.req_list != NULL) {
2913 		kfree(ss->tx.req_list, M_DEVBUF);
2914 		ss->tx.req_list = NULL;
2915 	}
2916 
2917 	if (ss->tx.seg_list != NULL) {
2918 		kfree(ss->tx.seg_list, M_DEVBUF);
2919 		ss->tx.seg_list = NULL;
2920 	}
2921 
2922 	if (ss->rx_data.rx_small.shadow != NULL) {
2923 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2924 		ss->rx_data.rx_small.shadow = NULL;
2925 	}
2926 
2927 	if (ss->rx_data.rx_big.shadow != NULL) {
2928 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2929 		ss->rx_data.rx_big.shadow = NULL;
2930 	}
2931 
2932 	if (ss->tx.info != NULL) {
2933 		if (ss->tx.dmat != NULL) {
2934 			for (i = 0; i <= ss->tx.mask; i++) {
2935 				bus_dmamap_destroy(ss->tx.dmat,
2936 				    ss->tx.info[i].map);
2937 			}
2938 			bus_dma_tag_destroy(ss->tx.dmat);
2939 		}
2940 		kfree(ss->tx.info, M_DEVBUF);
2941 		ss->tx.info = NULL;
2942 	}
2943 
2944 	if (ss->rx_data.rx_small.info != NULL) {
2945 		if (ss->rx_data.rx_small.dmat != NULL) {
2946 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2947 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2948 				    ss->rx_data.rx_small.info[i].map);
2949 			}
2950 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2951 			    ss->rx_data.rx_small.extra_map);
2952 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2953 		}
2954 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2955 		ss->rx_data.rx_small.info = NULL;
2956 	}
2957 
2958 	if (ss->rx_data.rx_big.info != NULL) {
2959 		if (ss->rx_data.rx_big.dmat != NULL) {
2960 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2961 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2962 				    ss->rx_data.rx_big.info[i].map);
2963 			}
2964 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2965 			    ss->rx_data.rx_big.extra_map);
2966 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2967 		}
2968 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2969 		ss->rx_data.rx_big.info = NULL;
2970 	}
2971 }
2972 
2973 static void
2974 mxge_free_rings(mxge_softc_t *sc)
2975 {
2976 	int slice;
2977 
2978 	if (sc->ss == NULL)
2979 		return;
2980 
2981 	for (slice = 0; slice < sc->num_slices; slice++)
2982 		mxge_free_slice_rings(&sc->ss[slice]);
2983 }
2984 
2985 static int
2986 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2987     int tx_ring_entries)
2988 {
2989 	mxge_softc_t *sc = ss->sc;
2990 	size_t bytes;
2991 	int err, i;
2992 
2993 	/*
2994 	 * Allocate per-slice receive resources
2995 	 */
2996 
2997 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
2998 	    rx_ring_entries - 1;
2999 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
3000 
3001 	/* Allocate the rx shadow rings */
3002 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3003 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3004 
3005 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3006 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3007 
3008 	/* Allocate the rx host info rings */
3009 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3010 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3011 
3012 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3013 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 
3015 	/* Allocate the rx busdma resources */
3016 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3017 				 1,			/* alignment */
3018 				 4096,			/* boundary */
3019 				 BUS_SPACE_MAXADDR,	/* low */
3020 				 BUS_SPACE_MAXADDR,	/* high */
3021 				 NULL, NULL,		/* filter */
3022 				 MHLEN,			/* maxsize */
3023 				 1,			/* num segs */
3024 				 MHLEN,			/* maxsegsize */
3025 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3026 				 			/* flags */
3027 				 &ss->rx_data.rx_small.dmat); /* tag */
3028 	if (err != 0) {
3029 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3030 		    err);
3031 		return err;
3032 	}
3033 
3034 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3035 	    &ss->rx_data.rx_small.extra_map);
3036 	if (err != 0) {
3037 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3038 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3039 		ss->rx_data.rx_small.dmat = NULL;
3040 		return err;
3041 	}
3042 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3043 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3044 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3045 		if (err != 0) {
3046 			int j;
3047 
3048 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3049 
3050 			for (j = 0; j < i; ++j) {
3051 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3052 				    ss->rx_data.rx_small.info[j].map);
3053 			}
3054 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3055 			    ss->rx_data.rx_small.extra_map);
3056 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3057 			ss->rx_data.rx_small.dmat = NULL;
3058 			return err;
3059 		}
3060 	}
3061 
3062 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3063 				 1,			/* alignment */
3064 				 4096,			/* boundary */
3065 				 BUS_SPACE_MAXADDR,	/* low */
3066 				 BUS_SPACE_MAXADDR,	/* high */
3067 				 NULL, NULL,		/* filter */
3068 				 4096,			/* maxsize */
3069 				 1,			/* num segs */
3070 				 4096,			/* maxsegsize*/
3071 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3072 				 			/* flags */
3073 				 &ss->rx_data.rx_big.dmat); /* tag */
3074 	if (err != 0) {
3075 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3076 		    err);
3077 		return err;
3078 	}
3079 
3080 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3081 	    &ss->rx_data.rx_big.extra_map);
3082 	if (err != 0) {
3083 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3084 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3085 		ss->rx_data.rx_big.dmat = NULL;
3086 		return err;
3087 	}
3088 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3089 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3090 		    &ss->rx_data.rx_big.info[i].map);
3091 		if (err != 0) {
3092 			int j;
3093 
3094 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3095 			for (j = 0; j < i; ++j) {
3096 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3097 				    ss->rx_data.rx_big.info[j].map);
3098 			}
3099 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3100 			    ss->rx_data.rx_big.extra_map);
3101 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3102 			ss->rx_data.rx_big.dmat = NULL;
3103 			return err;
3104 		}
3105 	}
3106 
3107 	/*
3108 	 * Now allocate TX resources
3109 	 */
3110 
3111 	ss->tx.mask = tx_ring_entries - 1;
3112 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3113 
3114 	/*
3115 	 * Allocate the tx request copy block; it MUST be at least
3116 	 * 8-byte aligned.
3117 	 */
3118 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3119 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3120 	    M_DEVBUF, M_WAITOK);
3121 
3122 	/* Allocate the tx busdma segment list */
3123 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3124 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3125 
3126 	/* Allocate the tx host info ring */
3127 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3128 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3129 
3130 	/* Allocate the tx busdma resources */
3131 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3132 				 1,			/* alignment */
3133 				 sc->tx_boundary,	/* boundary */
3134 				 BUS_SPACE_MAXADDR,	/* low */
3135 				 BUS_SPACE_MAXADDR,	/* high */
3136 				 NULL, NULL,		/* filter */
3137 				 IP_MAXPACKET +
3138 				 sizeof(struct ether_vlan_header),
3139 				 			/* maxsize */
3140 				 ss->tx.max_desc - 2,	/* num segs */
3141 				 sc->tx_boundary,	/* maxsegsz */
3142 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3143 				 BUS_DMA_ONEBPAGE,	/* flags */
3144 				 &ss->tx.dmat);		/* tag */
3145 	if (err != 0) {
3146 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3147 		return err;
3148 	}
3149 
3150 	/*
3151 	 * Now use these tags to setup DMA maps for each slot in the ring
3152 	 */
3153 	for (i = 0; i <= ss->tx.mask; i++) {
3154 		err = bus_dmamap_create(ss->tx.dmat,
3155 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3156 		if (err != 0) {
3157 			int j;
3158 
3159 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3160 			for (j = 0; j < i; ++j) {
3161 				bus_dmamap_destroy(ss->tx.dmat,
3162 				    ss->tx.info[j].map);
3163 			}
3164 			bus_dma_tag_destroy(ss->tx.dmat);
3165 			ss->tx.dmat = NULL;
3166 			return err;
3167 		}
3168 	}
3169 	return 0;
3170 }
3171 
3172 static int
3173 mxge_alloc_rings(mxge_softc_t *sc)
3174 {
3175 	mxge_cmd_t cmd;
3176 	int tx_ring_size;
3177 	int tx_ring_entries, rx_ring_entries;
3178 	int err, slice;
3179 
3180 	/* Get ring sizes */
3181 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3182 	if (err != 0) {
3183 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3184 		return err;
3185 	}
3186 	tx_ring_size = cmd.data0;
3187 
3188 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3189 	rx_ring_entries = sc->rx_intr_slots / 2;
3190 
3191 	if (bootverbose) {
3192 		device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3193 		    tx_ring_entries, rx_ring_entries);
3194 	}
3195 
3196 	sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3197 	sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3198 
3199 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3200 	ifq_set_ready(&sc->ifp->if_snd);
3201 	ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3202 
3203 	if (sc->num_tx_rings > 1) {
3204 		sc->ifp->if_mapsubq = ifq_mapsubq_mask;
3205 		ifq_set_subq_mask(&sc->ifp->if_snd, sc->num_tx_rings - 1);
3206 	}
3207 
3208 	for (slice = 0; slice < sc->num_slices; slice++) {
3209 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3210 		    rx_ring_entries, tx_ring_entries);
3211 		if (err != 0) {
3212 			device_printf(sc->dev,
3213 			    "alloc %d slice rings failed\n", slice);
3214 			return err;
3215 		}
3216 	}
3217 	return 0;
3218 }
3219 
3220 static void
3221 mxge_choose_params(int mtu, int *cl_size)
3222 {
3223 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3224 
3225 	if (bufsize < MCLBYTES) {
3226 		*cl_size = MCLBYTES;
3227 	} else {
3228 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3229 		*cl_size = MJUMPAGESIZE;
3230 	}
3231 }
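/*
 * Worked example: a standard 1500-byte MTU yields 1500 + 14 + 4 + 2 =
 * 1520 bytes, which fits in a 2K cluster (MCLBYTES); a 4000-byte MTU
 * needs 4020 bytes and falls through to the page-sized jumbo cluster
 * (MJUMPAGESIZE).
 */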
3232 
3233 static int
3234 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3235 {
3236 	mxge_cmd_t cmd;
3237 	int err, i, slice;
3238 
3239 	slice = ss - ss->sc->ss;
3240 
3241 	/*
3242 	 * Get the lanai pointers to the send and receive rings
3243 	 */
3244 	err = 0;
3245 
3246 	if (ss->sc->num_tx_rings == 1) {
3247 		if (slice == 0) {
3248 			cmd.data0 = slice;
3249 			err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3250 			    &cmd);
3251 			ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3252 			    (ss->sc->sram + cmd.data0);
3253 			/* Leave send_go and send_stop as NULL */
3254 		}
3255 	} else {
3256 		cmd.data0 = slice;
3257 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3258 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3259 		    (ss->sc->sram + cmd.data0);
3260 		ss->tx.send_go = (volatile uint32_t *)
3261 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3262 		ss->tx.send_stop = (volatile uint32_t *)
3263 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3264 	}
3265 
3266 	cmd.data0 = slice;
3267 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3268 	ss->rx_data.rx_small.lanai =
3269 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3270 
3271 	cmd.data0 = slice;
3272 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3273 	ss->rx_data.rx_big.lanai =
3274 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3275 
3276 	if (err != 0) {
3277 		if_printf(ss->sc->ifp,
3278 		    "failed to get ring sizes or locations\n");
3279 		return EIO;
3280 	}
3281 
3282 	/*
3283 	 * Stock small receive ring
3284 	 */
3285 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3286 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3287 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3288 		if (err) {
3289 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3290 			    ss->rx_data.rx_small.mask + 1);
3291 			return ENOMEM;
3292 		}
3293 	}
3294 
3295 	/*
3296 	 * Stock big receive ring
3297 	 */
3298 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3299 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3300 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3301 	}
3302 
3303 	ss->rx_data.rx_big.cl_size = cl_size;
3304 
3305 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3306 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3307 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3308 		if (err) {
3309 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3310 			    ss->rx_data.rx_big.mask + 1);
3311 			return ENOMEM;
3312 		}
3313 	}
3314 	return 0;
3315 }
3316 
3317 static int
3318 mxge_open(mxge_softc_t *sc)
3319 {
3320 	struct ifnet *ifp = sc->ifp;
3321 	mxge_cmd_t cmd;
3322 	int err, slice, cl_size, i;
3323 	bus_addr_t bus;
3324 	volatile uint8_t *itable;
3325 	struct mxge_slice_state *ss;
3326 
3327 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3328 
3329 	/* Copy the MAC address in case it was overridden */
3330 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3331 
3332 	err = mxge_reset(sc, 1);
3333 	if (err != 0) {
3334 		if_printf(ifp, "failed to reset\n");
3335 		return EIO;
3336 	}
3337 
3338 	if (sc->num_slices > 1) {
3339 		/* Setup the indirection table */
3340 		cmd.data0 = sc->num_slices;
3341 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3342 
3343 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3344 		if (err != 0) {
3345 			if_printf(ifp, "failed to setup rss tables\n");
3346 			return err;
3347 		}
3348 
3349 		/* Just enable an identity mapping */
3350 		itable = sc->sram + cmd.data0;
3351 		for (i = 0; i < sc->num_slices; i++)
3352 			itable[i] = (uint8_t)i;
3353 
3354 		if (sc->use_rss) {
3355 			volatile uint8_t *hwkey;
3356 			uint8_t swkey[MXGE_HWRSS_KEYLEN];
3357 
3358 			err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3359 			    &cmd);
3360 			if (err != 0) {
3361 				if_printf(ifp, "failed to get rsskey\n");
3362 				return err;
3363 			}
3364 			hwkey = sc->sram + cmd.data0;
3365 
3366 			toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3367 			for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3368 				hwkey[i] = swkey[i];
3369 			wmb();
3370 
3371 			err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3372 			    &cmd);
3373 			if (err != 0) {
3374 				if_printf(ifp, "failed to update rsskey\n");
3375 				return err;
3376 			}
3377 			if (bootverbose)
3378 				if_printf(ifp, "RSS key updated\n");
3379 		}
3380 
3381 		cmd.data0 = 1;
3382 		if (sc->use_rss) {
3383 			if (bootverbose)
3384 				if_printf(ifp, "input hash: RSS\n");
3385 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3386 			    MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3387 		} else {
3388 			if (bootverbose)
3389 				if_printf(ifp, "input hash: SRC_DST_PORT\n");
3390 			cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3391 		}
3392 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3393 		if (err != 0) {
3394 			if_printf(ifp, "failed to enable slices\n");
3395 			return err;
3396 		}
3397 	}
3398 
3399 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3400 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3401 	if (err) {
3402 		/*
3403 		 * If we can't change the TSO mode to NDIS, never allow TSO
3404 		 */
3405 		if_printf(ifp, "failed to set TSO mode\n");
3406 		ifp->if_capenable &= ~IFCAP_TSO;
3407 		ifp->if_capabilities &= ~IFCAP_TSO;
3408 		ifp->if_hwassist &= ~CSUM_TSO;
3409 	}
3410 
3411 	mxge_choose_params(ifp->if_mtu, &cl_size);
3412 
3413 	cmd.data0 = 1;
3414 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3415 	/*
3416 	 * Error is only meaningful if we're trying to set
3417 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3418 	 */
3419 
3420 	/*
3421 	 * Give the firmware the mtu and the big and small buffer
3422 	 * sizes.  The firmware wants the big buf size to be a power
3423 	 * of two.  Luckily, DragonFly's clusters are powers of two.
3424 	 */
3425 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3426 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3427 
3428 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3429 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3430 
3431 	cmd.data0 = cl_size;
3432 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3433 
3434 	if (err != 0) {
3435 		if_printf(ifp, "failed to setup params\n");
3436 		goto abort;
3437 	}
3438 
3439 	/* Now give the firmware the pointer to the stats block */
3440 	for (slice = 0; slice < sc->num_slices; slice++) {
3441 		ss = &sc->ss[slice];
3442 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3443 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3444 		cmd.data2 = sizeof(struct mcp_irq_data);
3445 		cmd.data2 |= (slice << 16);
3446 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3447 	}
3448 
3449 	if (err != 0) {
3450 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3451 		bus += offsetof(struct mcp_irq_data, send_done_count);
3452 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3453 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3454 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3455 		    &cmd);
3456 
3457 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3458 		sc->fw_multicast_support = 0;
3459 	} else {
3460 		sc->fw_multicast_support = 1;
3461 	}
3462 
3463 	if (err != 0) {
3464 		if_printf(ifp, "failed to setup params\n");
3465 		goto abort;
3466 	}
3467 
3468 	for (slice = 0; slice < sc->num_slices; slice++) {
3469 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3470 		if (err != 0) {
3471 			if_printf(ifp, "couldn't open slice %d\n", slice);
3472 			goto abort;
3473 		}
3474 	}
3475 
3476 	/* Finally, start the firmware running */
3477 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3478 	if (err) {
3479 		if_printf(ifp, "Couldn't bring up link\n");
3480 		goto abort;
3481 	}
3482 
3483 	ifp->if_flags |= IFF_RUNNING;
3484 	for (i = 0; i < sc->num_tx_rings; ++i) {
3485 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3486 
3487 		ifsq_clr_oactive(tx->ifsq);
3488 		ifsq_watchdog_start(&tx->watchdog);
3489 	}
3490 
3491 	return 0;
3492 
3493 abort:
3494 	mxge_free_mbufs(sc);
3495 	return err;
3496 }
3497 
3498 static void
3499 mxge_close(mxge_softc_t *sc, int down)
3500 {
3501 	struct ifnet *ifp = sc->ifp;
3502 	mxge_cmd_t cmd;
3503 	int err, old_down_cnt, i;
3504 
3505 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3506 
3507 	if (!down) {
3508 		old_down_cnt = sc->down_cnt;
3509 		wmb();
3510 
3511 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3512 		if (err)
3513 			if_printf(ifp, "Couldn't bring down link\n");
3514 
3515 		if (old_down_cnt == sc->down_cnt) {
3516 			/*
3517 			 * Wait for down irq
3518 			 * XXX racy
3519 			 */
3520 			ifnet_deserialize_all(ifp);
3521 			DELAY(10 * sc->intr_coal_delay);
3522 			ifnet_serialize_all(ifp);
3523 		}
3524 
3525 		wmb();
3526 		if (old_down_cnt == sc->down_cnt)
3527 			if_printf(ifp, "never got down irq\n");
3528 	}
3529 	mxge_free_mbufs(sc);
3530 
3531 	ifp->if_flags &= ~IFF_RUNNING;
3532 	for (i = 0; i < sc->num_tx_rings; ++i) {
3533 		mxge_tx_ring_t *tx = &sc->ss[i].tx;
3534 
3535 		ifsq_clr_oactive(tx->ifsq);
3536 		ifsq_watchdog_stop(&tx->watchdog);
3537 	}
3538 }
3539 
3540 static void
3541 mxge_setup_cfg_space(mxge_softc_t *sc)
3542 {
3543 	device_t dev = sc->dev;
3544 	int reg;
3545 	uint16_t lnk, pectl;
3546 
3547 	/* Find the PCIe link width and set max read request to 4KB */
3548 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3549 		lnk = pci_read_config(dev, reg + 0x12, 2);
3550 		sc->link_width = (lnk >> 4) & 0x3f;
3551 
3552 		if (sc->pectl == 0) {
3553 			pectl = pci_read_config(dev, reg + 0x8, 2);
3554 			pectl = (pectl & ~0x7000) | (5 << 12);
3555 			pci_write_config(dev, reg + 0x8, pectl, 2);
3556 			sc->pectl = pectl;
3557 		} else {
3558 			/* Restore saved pectl after watchdog reset */
3559 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3560 		}
3561 	}
3562 
3563 	/* Enable DMA and memory space access */
3564 	pci_enable_busmaster(dev);
3565 }
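/*
 * Register decode for the accesses above, per the standard PCIe
 * capability layout: reg + 0x8 is the Device Control register, whose
 * bits 14:12 encode the max read request size (5 -> 4096 bytes);
 * reg + 0x12 is the Link Status register, whose bits 9:4 carry the
 * negotiated link width extracted above.
 */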
3566 
3567 static uint32_t
3568 mxge_read_reboot(mxge_softc_t *sc)
3569 {
3570 	device_t dev = sc->dev;
3571 	uint32_t vs;
3572 
3573 	/* Find the vendor specific offset */
3574 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3575 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3576 		return (uint32_t)-1;
3577 	}
3578 	/* Enable read32 mode */
3579 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3580 	/* Tell NIC which register to read */
3581 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3582 	return pci_read_config(dev, vs + 0x14, 4);
3583 }
3584 
3585 static void
3586 mxge_watchdog_reset(mxge_softc_t *sc)
3587 {
3588 	struct pci_devinfo *dinfo;
3589 	int err, running;
3590 	uint32_t reboot;
3591 	uint16_t cmd;
3592 
3593 	err = ENXIO;
3594 
3595 	if_printf(sc->ifp, "Watchdog reset!\n");
3596 
3597 	/*
3598 	 * Check to see if the NIC rebooted.  If it did, then all of
3599 	 * PCI config space has been reset, and things like the
3600 	 * busmaster bit will be zero.  If this is the case, then we
3601 	 * must restore PCI config space before the NIC can be used
3602 	 * again
3603 	 */
3604 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3605 	if (cmd == 0xffff) {
3606 		/*
3607 		 * Maybe the watchdog caught the NIC rebooting; wait
3608 		 * up to 100ms for it to finish.  If it does not come
3609 		 * back, then give up
3610 		 */
3611 		DELAY(1000*100);
3612 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3613 		if (cmd == 0xffff)
3614 			if_printf(sc->ifp, "NIC disappeared!\n");
3615 	}
3616 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3617 		/* Print the reboot status */
3618 		reboot = mxge_read_reboot(sc);
3619 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3620 
3621 		running = sc->ifp->if_flags & IFF_RUNNING;
3622 		if (running) {
3623 			/*
3624 			 * Quiesce NIC so that TX routines will not try to
3625 			 * xmit after restoration of BAR
3626 			 */
3627 
3628 			/* Mark the link as down */
3629 			if (sc->link_state) {
3630 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3631 				if_link_state_change(sc->ifp);
3632 			}
3633 			mxge_close(sc, 1);
3634 		}
3635 		/* Restore PCI configuration space */
3636 		dinfo = device_get_ivars(sc->dev);
3637 		pci_cfg_restore(sc->dev, dinfo);
3638 
3639 		/* And redo any changes we made to our config space */
3640 		mxge_setup_cfg_space(sc);
3641 
3642 		/* Reload f/w */
3643 		err = mxge_load_firmware(sc, 0);
3644 		if (err)
3645 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3646 		if (running && !err) {
3647 			int i;
3648 
3649 			err = mxge_open(sc);
3650 
3651 			for (i = 0; i < sc->num_tx_rings; ++i)
3652 				ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3653 		}
3654 		sc->watchdog_resets++;
3655 	} else {
3656 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3657 		err = 0;
3658 	}
3659 	if (err) {
3660 		if_printf(sc->ifp, "watchdog reset failed\n");
3661 	} else {
3662 		if (sc->dying == 2)
3663 			sc->dying = 0;
3664 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3665 	}
3666 }
3667 
3668 static void
3669 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3670 {
3671 	if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3672 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3673 	    tx->req, tx->done, tx->queue_active);
3674 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3675 	    tx->activate, tx->deactivate);
3676 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3677 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3678 }
3679 
3680 static u_long
3681 mxge_update_stats(mxge_softc_t *sc)
3682 {
3683 	u_long ipackets, opackets, pkts;
3684 
3685 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3686 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3687 
3688 	pkts = ipackets - sc->ipackets;
3689 	pkts += opackets - sc->opackets;
3690 
3691 	sc->ipackets = ipackets;
3692 	sc->opackets = opackets;
3693 
3694 	return pkts;
3695 }
3696 
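/*
 * Periodic housekeeping: aggregate statistics, re-probe the media if
 * requested, and sanity check PCI config space while the NIC is idle.
 * The callout interval is stretched 4x when no traffic is flowing.
 */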
3697 static void
3698 mxge_tick(void *arg)
3699 {
3700 	mxge_softc_t *sc = arg;
3701 	u_long pkts = 0;
3702 	int err = 0;
3703 	int ticks;
3704 
3705 	lwkt_serialize_enter(&sc->main_serialize);
3706 
3707 	ticks = mxge_ticks;
3708 	if (sc->ifp->if_flags & IFF_RUNNING) {
3709 		/* Aggregate stats from different slices */
3710 		pkts = mxge_update_stats(sc);
3711 		if (sc->need_media_probe)
3712 			mxge_media_probe(sc);
3713 	}
3714 	if (pkts == 0) {
3715 		uint16_t cmd;
3716 
3717 		/* Ensure NIC did not suffer h/w fault while idle */
3718 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3719 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3720 			sc->dying = 2;
3721 			mxge_serialize_skipmain(sc);
3722 			mxge_watchdog_reset(sc);
3723 			mxge_deserialize_skipmain(sc);
3724 			err = ENXIO;
3725 		}
3726 
3727 		/* Look less often if NIC is idle */
3728 		ticks *= 4;
3729 	}
3730 
3731 	if (err == 0)
3732 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3733 
3734 	lwkt_serialize_exit(&sc->main_serialize);
3735 }
3736 
3737 static int
3738 mxge_media_change(struct ifnet *ifp)
3739 {
3740 	return EINVAL;
3741 }
3742 
3743 static int
3744 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3745 {
3746 	struct ifnet *ifp = sc->ifp;
3747 	int real_mtu, old_mtu;
3748 	int err = 0;
3749 
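	/*
	 * The on-wire size includes the Ethernet header and a possible
	 * 802.1Q tag; reject anything below the 60 byte minimum frame
	 * size (excluding FCS).
	 */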
3750 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3751 	if (mtu > sc->max_mtu || real_mtu < 60)
3752 		return EINVAL;
3753 
3754 	old_mtu = ifp->if_mtu;
3755 	ifp->if_mtu = mtu;
3756 	if (ifp->if_flags & IFF_RUNNING) {
3757 		mxge_close(sc, 0);
3758 		err = mxge_open(sc);
3759 		if (err != 0) {
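			/* Reopen at the old MTU if the new one failed */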
3760 			ifp->if_mtu = old_mtu;
3761 			mxge_close(sc, 0);
3762 			mxge_open(sc);
3763 		}
3764 	}
3765 	return err;
3766 }
3767 
3768 static void
3769 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3770 {
3771 	mxge_softc_t *sc = ifp->if_softc;
3772 
3774 	if (sc == NULL)
3775 		return;
3776 	ifmr->ifm_status = IFM_AVALID;
3777 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3778 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3779 	ifmr->ifm_active |= sc->current_media;
3780 }
3781 
3782 static int
3783 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3784     struct ucred *cr __unused)
3785 {
3786 	mxge_softc_t *sc = ifp->if_softc;
3787 	struct ifreq *ifr = (struct ifreq *)data;
3788 	int err, mask;
3789 
3790 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3791 	err = 0;
3792 
3793 	switch (command) {
3794 	case SIOCSIFMTU:
3795 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3796 		break;
3797 
3798 	case SIOCSIFFLAGS:
3799 		if (sc->dying)
3800 			return EINVAL;
3801 
3802 		if (ifp->if_flags & IFF_UP) {
3803 			if (!(ifp->if_flags & IFF_RUNNING)) {
3804 				err = mxge_open(sc);
3805 			} else {
3806 				/*
3807 				 * Take care of PROMISC and ALLMULTI
3808 				 * flag changes
3809 				 */
3810 				mxge_change_promisc(sc,
3811 				    ifp->if_flags & IFF_PROMISC);
3812 				mxge_set_multicast_list(sc);
3813 			}
3814 		} else {
3815 			if (ifp->if_flags & IFF_RUNNING)
3816 				mxge_close(sc, 0);
3817 		}
3818 		break;
3819 
3820 	case SIOCADDMULTI:
3821 	case SIOCDELMULTI:
3822 		mxge_set_multicast_list(sc);
3823 		break;
3824 
3825 	case SIOCSIFCAP:
3826 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3827 		if (mask & IFCAP_TXCSUM) {
3828 			ifp->if_capenable ^= IFCAP_TXCSUM;
3829 			if (ifp->if_capenable & IFCAP_TXCSUM)
3830 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3831 			else
3832 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3833 		}
3834 		if (mask & IFCAP_TSO) {
3835 			ifp->if_capenable ^= IFCAP_TSO;
3836 			if (ifp->if_capenable & IFCAP_TSO)
3837 				ifp->if_hwassist |= CSUM_TSO;
3838 			else
3839 				ifp->if_hwassist &= ~CSUM_TSO;
3840 		}
3841 		if (mask & IFCAP_RXCSUM)
3842 			ifp->if_capenable ^= IFCAP_RXCSUM;
3843 		if (mask & IFCAP_VLAN_HWTAGGING)
3844 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3845 		break;
3846 
3847 	case SIOCGIFMEDIA:
3848 		mxge_media_probe(sc);
3849 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3850 		    &sc->media, command);
3851 		break;
3852 
3853 	default:
3854 		err = ether_ioctl(ifp, command, data);
3855 		break;
3856 	}
3857 	return err;
3858 }
3859 
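/*
 * Snapshot the driver-wide tunables into the softc, clamping them to
 * sane ranges.  These defaults would typically be overridden through
 * loader tunables (e.g. something like hw.mxge.intr_coal_delay in
 * loader.conf; the exact tunable names are an assumption here, as
 * they are registered elsewhere in the driver).
 */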
3860 static void
3861 mxge_fetch_tunables(mxge_softc_t *sc)
3862 {
3863 	sc->intr_coal_delay = mxge_intr_coal_delay;
3864 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3865 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3866 
3867 	/* XXX arbitrary default: tick twice a second */
3868 	if (mxge_ticks == 0)
3869 		mxge_ticks = hz / 2;
3870 
3871 	sc->pause = mxge_flow_control;
3872 	sc->use_rss = mxge_use_rss;
3873 
3874 	sc->throttle = mxge_throttle;
3875 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3876 		sc->throttle = MXGE_MAX_THROTTLE;
3877 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3878 		sc->throttle = MXGE_MIN_THROTTLE;
3879 }
3880 
3881 static void
3882 mxge_free_slices(mxge_softc_t *sc)
3883 {
3884 	struct mxge_slice_state *ss;
3885 	int i;
3886 
3887 	if (sc->ss == NULL)
3888 		return;
3889 
3890 	for (i = 0; i < sc->num_slices; i++) {
3891 		ss = &sc->ss[i];
3892 		if (ss->fw_stats != NULL) {
3893 			mxge_dma_free(&ss->fw_stats_dma);
3894 			ss->fw_stats = NULL;
3895 		}
3896 		if (ss->rx_data.rx_done.entry != NULL) {
3897 			mxge_dma_free(&ss->rx_done_dma);
3898 			ss->rx_data.rx_done.entry = NULL;
3899 		}
3900 	}
3901 	kfree(sc->ss, M_DEVBUF);
3902 	sc->ss = NULL;
3903 }
3904 
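/*
 * Allocate per-slice state: each slice gets a DMA-able rx completion
 * ("rx_done") queue, sized from the firmware's rx ring size, plus a
 * small DMA block that the firmware fills with its statistics.
 */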
3905 static int
3906 mxge_alloc_slices(mxge_softc_t *sc)
3907 {
3908 	mxge_cmd_t cmd;
3909 	struct mxge_slice_state *ss;
3910 	size_t bytes;
3911 	int err, i, rx_ring_size;
3912 
3913 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3914 	if (err != 0) {
3915 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3916 		return err;
3917 	}
3918 	rx_ring_size = cmd.data0;
3919 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof(mcp_dma_addr_t));
3920 
3921 	bytes = sizeof(*sc->ss) * sc->num_slices;
3922 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3923 
3924 	for (i = 0; i < sc->num_slices; i++) {
3925 		ss = &sc->ss[i];
3926 
3927 		ss->sc = sc;
3928 
3929 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3930 		lwkt_serialize_init(&ss->tx.tx_serialize);
3931 		ss->intr_rid = -1;
3932 
3933 		/*
3934 		 * Allocate the per-slice rx interrupt queue
3935 		 * XXX assumes 4-byte mcp_slot
3936 		 */
3937 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3938 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3939 		if (err != 0) {
3940 			device_printf(sc->dev,
3941 			    "alloc slice %d rx_done failed\n", i);
3942 			return err;
3943 		}
3944 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3945 
3946 		/*
3947 		 * Allocate the per-slice firmware stats
3948 		 */
3949 		bytes = sizeof(*ss->fw_stats);
3950 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
3952 		if (err != 0) {
3953 			device_printf(sc->dev,
3954 			    "alloc slice %d fw_stats failed\n", i);
3955 			return err;
3956 		}
3957 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3958 	}
3959 	return 0;
3960 }
3961 
3962 static void
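/*
 * Decide how many slices (and tx rings) to use.  Multiple slices
 * require more than one CPU, MSI-X and the RSS firmware; the final
 * count is bounded by the MSI-X vector count (rounded down to a power
 * of 2) and the firmware's MXGEFW_CMD_GET_MAX_RSS_QUEUES answer.
 */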
3963 mxge_slice_probe(mxge_softc_t *sc)
3964 {
3965 	int status, max_intr_slots, max_slices, num_slices;
3966 	int msix_cnt, msix_enable, i, multi_tx;
3967 	mxge_cmd_t cmd;
3968 	const char *old_fw;
3969 
3970 	sc->num_slices = 1;
3971 	sc->num_tx_rings = 1;
3972 
3973 	num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
3974 	if (num_slices == 1)
3975 		return;
3976 
3977 	if (ncpus2 == 1)
3978 		return;
3979 
3980 	msix_enable = device_getenv_int(sc->dev, "msix.enable",
3981 	    mxge_msix_enable);
3982 	if (!msix_enable)
3983 		return;
3984 
3985 	msix_cnt = pci_msix_count(sc->dev);
3986 	if (msix_cnt < 2)
3987 		return;
3988 
3989 	/*
3990 	 * Round down MSI-X vector count to the nearest power of 2
3991 	 */
3992 	i = 0;
3993 	while ((1 << (i + 1)) <= msix_cnt)
3994 		++i;
3995 	msix_cnt = 1 << i;
3996 
3997 	/*
3998 	 * Now load the slice-aware firmware and see what it supports
3999 	 */
4000 	old_fw = sc->fw_name;
4001 	if (old_fw == mxge_fw_aligned)
4002 		sc->fw_name = mxge_fw_rss_aligned;
4003 	else
4004 		sc->fw_name = mxge_fw_rss_unaligned;
4005 	status = mxge_load_firmware(sc, 0);
4006 	if (status != 0) {
4007 		device_printf(sc->dev, "Falling back to a single slice\n");
4008 		return;
4009 	}
4010 
4011 	/*
4012 	 * Try to send a reset command to the card to see if it is alive
4013 	 */
4014 	memset(&cmd, 0, sizeof(cmd));
4015 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4016 	if (status != 0) {
4017 		device_printf(sc->dev, "failed reset\n");
4018 		goto abort_with_fw;
4019 	}
4020 
4021 	/*
4022 	 * Get rx ring size to calculate rx interrupt queue size
4023 	 */
4024 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4025 	if (status != 0) {
4026 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4027 		goto abort_with_fw;
4028 	}
4029 	max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4030 
4031 	/*
4032 	 * Tell it the size of the rx interrupt queue
4033 	 */
4034 	cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4035 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4036 	if (status != 0) {
4037 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4038 		goto abort_with_fw;
4039 	}
4040 
4041 	/*
4042 	 * Ask for the maximum number of slices it supports
4043 	 */
4044 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4045 	if (status != 0) {
4046 		device_printf(sc->dev,
4047 		    "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4048 		goto abort_with_fw;
4049 	}
4050 	max_slices = cmd.data0;
4051 
4052 	/*
4053 	 * Round down max slices count to the nearest power of 2
4054 	 */
4055 	i = 0;
4056 	while ((1 << (i + 1)) <= max_slices)
4057 		++i;
4058 	max_slices = 1 << i;
4059 
4060 	if (max_slices > msix_cnt)
4061 		max_slices = msix_cnt;
4062 
4063 	sc->num_slices = if_ring_count2(num_slices, max_slices);
4065 
4066 	multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4067 	if (multi_tx)
4068 		sc->num_tx_rings = sc->num_slices;
4069 
4070 	if (bootverbose) {
4071 		device_printf(sc->dev, "using %d slices, max %d\n",
4072 		    sc->num_slices, max_slices);
4073 	}
4074 
4075 	if (sc->num_slices == 1)
4076 		goto abort_with_fw;
4077 	return;
4078 
4079 abort_with_fw:
4080 	sc->fw_name = old_fw;
4081 	mxge_load_firmware(sc, 0);
4082 }
4083 
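/*
 * Build the flat serializer array used by the if_serialize methods
 * below.  The order is critical: the main serializer first, then all
 * rx serializers, then all tx serializers, so that the
 * ifnet_serialize_array_* helpers always acquire them consistently.
 */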
4084 static void
4085 mxge_setup_serialize(struct mxge_softc *sc)
4086 {
4087 	int i = 0, slice;
4088 
4089 	/* Main + rx + tx */
4090 	sc->nserialize = (2 * sc->num_slices) + 1;
4091 	sc->serializes =
4092 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4093 	        M_DEVBUF, M_WAITOK | M_ZERO);
4094 
4095 	/*
4096 	 * Setup serializes
4097 	 *
4098 	 * NOTE: Order is critical
4099 	 */
4100 
4101 	KKASSERT(i < sc->nserialize);
4102 	sc->serializes[i++] = &sc->main_serialize;
4103 
4104 	for (slice = 0; slice < sc->num_slices; ++slice) {
4105 		KKASSERT(i < sc->nserialize);
4106 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4107 	}
4108 
4109 	for (slice = 0; slice < sc->num_slices; ++slice) {
4110 		KKASSERT(i < sc->nserialize);
4111 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4112 	}
4113 
4114 	KKASSERT(i == sc->nserialize);
4115 }
4116 
4117 static void
4118 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4119 {
4120 	struct mxge_softc *sc = ifp->if_softc;
4121 
4122 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4123 }
4124 
4125 static void
4126 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4127 {
4128 	struct mxge_softc *sc = ifp->if_softc;
4129 
4130 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4131 }
4132 
4133 static int
4134 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4135 {
4136 	struct mxge_softc *sc = ifp->if_softc;
4137 
4138 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4139 }
4140 
4141 #ifdef INVARIANTS
4142 
4143 static void
4144 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4145     boolean_t serialized)
4146 {
4147 	struct mxge_softc *sc = ifp->if_softc;
4148 
4149 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4150 	    slz, serialized);
4151 }
4152 
4153 #endif	/* INVARIANTS */
4154 
4155 #ifdef IFPOLL_ENABLE
4156 
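/*
 * Polling handler for one slice: drain the rx completion queue if it
 * has work, otherwise hand the rx token back to the firmware.
 */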
4157 static void
4158 mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
4159 {
4160 	struct mxge_slice_state *ss = xss;
4161 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
4162 
4163 	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
4164 
4165 	if (rx_done->entry[rx_done->idx].length != 0) {
4166 		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
4167 	} else {
4168 		/*
4169 		 * XXX
4170 		 * This register write obviously has a cost;
4171 		 * however, if we don't hand back the rx token,
4172 		 * the upcoming packets may suffer ridiculously
4173 		 * large delays, as observed on 8AL-C using ping(8).
4174 		 */
4175 		*ss->irq_claim = be32toh(3);
4176 	}
4177 }
4178 
4179 static void
4180 mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
4181 {
4182 	struct mxge_softc *sc = ifp->if_softc;
4183 	int i;
4184 
4185 	if (info == NULL)
4186 		return;
4187 
4188 	/*
4189 	 * Only poll rx; polling tx and status does not seem to work
4190 	 */
4191 	for (i = 0; i < sc->num_slices; ++i) {
4192 		struct mxge_slice_state *ss = &sc->ss[i];
4193 		int idx = ss->intr_cpuid;
4194 
4195 		KKASSERT(idx < ncpus2);
4196 		info->ifpi_rx[idx].poll_func = mxge_npoll_rx;
4197 		info->ifpi_rx[idx].arg = ss;
4198 		info->ifpi_rx[idx].serializer = &ss->rx_data.rx_serialize;
4199 	}
4200 }
4201 
4202 #endif	/* IFPOLL_ENABLE */
4203 
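/*
 * Attach sequence: map the BAR, parse the EEPROM strings, select and
 * load firmware, negotiate slices, allocate interrupts and rings,
 * attach the ifnet, and finally install the interrupt handlers and
 * start the periodic tick.
 */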
4204 static int
4205 mxge_attach(device_t dev)
4206 {
4207 	mxge_softc_t *sc = device_get_softc(dev);
4208 	struct ifnet *ifp = &sc->arpcom.ac_if;
4209 	int err, rid, i;
4210 
4211 	/*
4212 	 * Avoid rewriting half the lines in this file to use
4213 	 * &sc->arpcom.ac_if instead
4214 	 */
4215 	sc->ifp = ifp;
4216 	sc->dev = dev;
4217 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4218 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4219 
4220 	lwkt_serialize_init(&sc->main_serialize);
4221 
4222 	mxge_fetch_tunables(sc);
4223 
4224 	err = bus_dma_tag_create(NULL,			/* parent */
4225 				 1,			/* alignment */
4226 				 0,			/* boundary */
4227 				 BUS_SPACE_MAXADDR,	/* low */
4228 				 BUS_SPACE_MAXADDR,	/* high */
4229 				 NULL, NULL,		/* filter */
4230 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4231 				 0, 			/* num segs */
4232 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4233 				 0,			/* flags */
4234 				 &sc->parent_dmat);	/* tag */
4235 	if (err != 0) {
4236 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4237 		goto failed;
4238 	}
4239 
4240 	callout_init_mp(&sc->co_hdl);
4241 
4242 	mxge_setup_cfg_space(sc);
4243 
4244 	/*
4245 	 * Map the board into the kernel
4246 	 */
4247 	rid = PCIR_BARS;
4248 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4249 	    &rid, RF_ACTIVE);
4250 	if (sc->mem_res == NULL) {
4251 		device_printf(dev, "could not map memory\n");
4252 		err = ENXIO;
4253 		goto failed;
4254 	}
4255 
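	/*
	 * Usable SRAM: the 2MB BAR minus reserved regions (seemingly
	 * two 48KB areas, one 32KB area and a 0x100 byte pad; this
	 * reading of the constants is an inference from the arithmetic,
	 * not documented here).
	 */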
4256 	sc->sram = rman_get_virtual(sc->mem_res);
4257 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4258 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4259 		device_printf(dev, "impossible memory region size %ld\n",
4260 		    rman_get_size(sc->mem_res));
4261 		err = ENXIO;
4262 		goto failed;
4263 	}
4264 
4265 	/*
4266 	 * Make a NUL-terminated copy of the EEPROM strings section of
4267 	 * the Lanai SRAM
4268 	 */
4269 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4270 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4271 	    rman_get_bushandle(sc->mem_res),
4272 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4273 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4274 	err = mxge_parse_strings(sc);
4275 	if (err != 0) {
4276 		device_printf(dev, "parse EEPROM string failed\n");
4277 		goto failed;
4278 	}
4279 
4280 	/*
4281 	 * Enable write combining for efficient use of PCIe bus
4282 	 */
4283 	mxge_enable_wc(sc);
4284 
4285 	/*
4286 	 * Allocate the out of band DMA memory
4287 	 */
4288 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4289 	if (err != 0) {
4290 		device_printf(dev, "alloc cmd DMA buf failed\n");
4291 		goto failed;
4292 	}
4293 	sc->cmd = sc->cmd_dma.dmem_addr;
4294 
4295 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4296 	if (err != 0) {
4297 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4298 		goto failed;
4299 	}
4300 
4301 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4302 	if (err != 0) {
4303 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4304 		goto failed;
4305 	}
4306 
4307 	/* Select & load the firmware */
4308 	err = mxge_select_firmware(sc);
4309 	if (err != 0) {
4310 		device_printf(dev, "select firmware failed\n");
4311 		goto failed;
4312 	}
4313 
4314 	mxge_slice_probe(sc);
4315 	err = mxge_alloc_slices(sc);
4316 	if (err != 0) {
4317 		device_printf(dev, "alloc slices failed\n");
4318 		goto failed;
4319 	}
4320 
4321 	err = mxge_alloc_intr(sc);
4322 	if (err != 0) {
4323 		device_printf(dev, "alloc intr failed\n");
4324 		goto failed;
4325 	}
4326 
4327 	/* Setup serializes */
4328 	mxge_setup_serialize(sc);
4329 
4330 	err = mxge_reset(sc, 0);
4331 	if (err != 0) {
4332 		device_printf(dev, "reset failed\n");
4333 		goto failed;
4334 	}
4335 
4336 	err = mxge_alloc_rings(sc);
4337 	if (err != 0) {
4338 		device_printf(dev, "failed to allocate rings\n");
4339 		goto failed;
4340 	}
4341 
4342 	ifp->if_baudrate = IF_Gbps(10UL);
4343 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4344 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4345 
4346 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4347 #if 0
4348 	/* Well, it's software, sigh */
4349 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4350 #endif
4351 	ifp->if_capenable = ifp->if_capabilities;
4352 
4353 	ifp->if_softc = sc;
4354 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4355 	ifp->if_init = mxge_init;
4356 	ifp->if_ioctl = mxge_ioctl;
4357 	ifp->if_start = mxge_start;
4358 #ifdef IFPOLL_ENABLE
4359 	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
4360 		ifp->if_npoll = mxge_npoll;
4361 #endif
4362 	ifp->if_serialize = mxge_serialize;
4363 	ifp->if_deserialize = mxge_deserialize;
4364 	ifp->if_tryserialize = mxge_tryserialize;
4365 #ifdef INVARIANTS
4366 	ifp->if_serialize_assert = mxge_serialize_assert;
4367 #endif
4368 
4369 	/* Increase TSO burst length */
4370 	ifp->if_tsolen = (32 * ETHERMTU);
4371 
4372 	/* Initialise the ifmedia structure */
4373 	mxge_media_init(sc);
4374 	mxge_media_probe(sc);
4375 
4376 	ether_ifattach(ifp, sc->mac_addr, NULL);
4377 
4378 	/* Setup TX rings and subqueues */
4379 	for (i = 0; i < sc->num_tx_rings; ++i) {
4380 		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
4381 		struct mxge_slice_state *ss = &sc->ss[i];
4382 
4383 		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
4384 		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
4385 		ifsq_set_priv(ifsq, &ss->tx);
4386 		ss->tx.ifsq = ifsq;
4387 
4388 		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
4389 	}
4390 
4391 	/*
4392 	 * XXX
4393 	 * We are not ready to do "gather" jumbo frames, so
4394 	 * limit MTU to MJUMPAGESIZE
4395 	 */
4396 	sc->max_mtu = MJUMPAGESIZE -
4397 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4398 	sc->dying = 0;
4399 
4400 	err = mxge_setup_intr(sc);
4401 	if (err != 0) {
4402 		device_printf(dev, "alloc and setup intr failed\n");
4403 		ether_ifdetach(ifp);
4404 		goto failed;
4405 	}
4406 
4407 	mxge_add_sysctls(sc);
4408 
4409 	/* Increase non-cluster mbuf limit; used by small RX rings */
4410 	mb_inclimit(ifp->if_nmbclusters);
4411 
4412 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4413 	    sc->ss[0].intr_cpuid);
4414 	return 0;
4415 
4416 failed:
4417 	mxge_detach(dev);
4418 	return err;
4419 }
4420 
4421 static int
4422 mxge_detach(device_t dev)
4423 {
4424 	mxge_softc_t *sc = device_get_softc(dev);
4425 
4426 	if (device_is_attached(dev)) {
4427 		struct ifnet *ifp = sc->ifp;
4428 		int mblimit = ifp->if_nmbclusters;
4429 
4430 		ifnet_serialize_all(ifp);
4431 
4432 		sc->dying = 1;
4433 		if (ifp->if_flags & IFF_RUNNING)
4434 			mxge_close(sc, 1);
4435 		callout_stop(&sc->co_hdl);
4436 
4437 		mxge_teardown_intr(sc, sc->num_slices);
4438 
4439 		ifnet_deserialize_all(ifp);
4440 
4441 		callout_terminate(&sc->co_hdl);
4442 
4443 		ether_ifdetach(ifp);
4444 
4445 		/* Decrease non-cluster mbuf limit increased by us */
4446 		mb_inclimit(-mblimit);
4447 	}
4448 	ifmedia_removeall(&sc->media);
4449 
4450 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4451 	    sc->sram != NULL)
4452 		mxge_dummy_rdma(sc, 0);
4453 
4454 	mxge_free_intr(sc);
4455 	mxge_rem_sysctls(sc);
4456 	mxge_free_rings(sc);
4457 
4458 	/* MUST be done after sysctls, intr and rings are freed */
4459 	mxge_free_slices(sc);
4460 
4461 	if (sc->dmabench_dma.dmem_addr != NULL)
4462 		mxge_dma_free(&sc->dmabench_dma);
4463 	if (sc->zeropad_dma.dmem_addr != NULL)
4464 		mxge_dma_free(&sc->zeropad_dma);
4465 	if (sc->cmd_dma.dmem_addr != NULL)
4466 		mxge_dma_free(&sc->cmd_dma);
4467 
4468 	if (sc->msix_table_res != NULL) {
4469 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
4470 		    sc->msix_table_res);
4471 	}
4472 	if (sc->mem_res != NULL) {
4473 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4474 		    sc->mem_res);
4475 	}
4476 
4477 	if (sc->parent_dmat != NULL)
4478 		bus_dma_tag_destroy(sc->parent_dmat);
4479 
4480 	return 0;
4481 }
4482 
4483 static int
4484 mxge_shutdown(device_t dev)
4485 {
4486 	return 0;
4487 }
4488 
4489 static void
4490 mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
4491 {
4492 	int i;
4493 
4494 	KKASSERT(sc->num_slices > 1);
4495 
4496 	for (i = 0; i < sc->num_slices; ++i) {
4497 		struct mxge_slice_state *ss = &sc->ss[i];
4498 
4499 		if (ss->intr_res != NULL) {
4500 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4501 			    ss->intr_rid, ss->intr_res);
4502 		}
4503 		if (ss->intr_rid >= 0)
4504 			pci_release_msix_vector(sc->dev, ss->intr_rid);
4505 	}
4506 	if (setup)
4507 		pci_teardown_msix(sc->dev);
4508 }
4509 
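/*
 * Allocate one MSI-X vector per slice.  Vectors are pinned to
 * consecutive CPUs starting at a per-device offset so that multiple
 * adapters do not all land on the same CPUs; slice 0 takes the
 * combined ("comb") interrupt and uses the main serializer.
 */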
4510 static int
4511 mxge_alloc_msix(struct mxge_softc *sc)
4512 {
4513 	struct mxge_slice_state *ss;
4514 	int offset, rid, error, i;
4515 	boolean_t setup = FALSE;
4516 
4517 	KKASSERT(sc->num_slices > 1);
4518 
4519 	if (sc->num_slices == ncpus2) {
4520 		offset = 0;
4521 	} else {
4522 		int offset_def;
4523 
4524 		offset_def = (sc->num_slices * device_get_unit(sc->dev)) %
4525 		    ncpus2;
4526 
4527 		offset = device_getenv_int(sc->dev, "msix.offset", offset_def);
4528 		if (offset >= ncpus2 ||
4529 		    offset % sc->num_slices != 0) {
4530 			device_printf(sc->dev, "invalid msix.offset %d, "
4531 			    "use %d\n", offset, offset_def);
4532 			offset = offset_def;
4533 		}
4534 	}
4535 
4536 	ss = &sc->ss[0];
4537 
4538 	ss->intr_serialize = &sc->main_serialize;
4539 	ss->intr_func = mxge_msi;
4540 	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4541 	    "%s comb", device_get_nameunit(sc->dev));
4542 	ss->intr_desc = ss->intr_desc0;
4543 	ss->intr_cpuid = offset;
4544 
4545 	for (i = 1; i < sc->num_slices; ++i) {
4546 		ss = &sc->ss[i];
4547 
4548 		ss->intr_serialize = &ss->rx_data.rx_serialize;
4549 		if (sc->num_tx_rings == 1) {
4550 			ss->intr_func = mxge_msix_rx;
4551 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4552 			    "%s rx", device_get_nameunit(sc->dev));
4553 		} else {
4554 			ss->intr_func = mxge_msix_rxtx;
4555 			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
4556 			    "%s rxtx", device_get_nameunit(sc->dev));
4557 		}
4558 		ss->intr_desc = ss->intr_desc0;
4559 		ss->intr_cpuid = offset + i;
4560 	}
4561 
4562 	rid = PCIR_BAR(2);
4563 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4564 	    &rid, RF_ACTIVE);
4565 	if (sc->msix_table_res == NULL) {
4566 		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
4567 		return ENXIO;
4568 	}
4569 
4570 	error = pci_setup_msix(sc->dev);
4571 	if (error) {
4572 		device_printf(sc->dev, "could not setup MSI-X\n");
4573 		goto back;
4574 	}
4575 	setup = TRUE;
4576 
4577 	for (i = 0; i < sc->num_slices; ++i) {
4578 		ss = &sc->ss[i];
4579 
4580 		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
4581 		    ss->intr_cpuid);
4582 		if (error) {
4583 			device_printf(sc->dev, "could not alloc "
4584 			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
4585 			goto back;
4586 		}
4587 
4588 		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4589 		    &ss->intr_rid, RF_ACTIVE);
4590 		if (ss->intr_res == NULL) {
4591 			device_printf(sc->dev, "could not alloc "
4592 			    "MSI-X %d resource\n", i);
4593 			error = ENXIO;
4594 			goto back;
4595 		}
4596 	}
4597 
4598 	pci_enable_msix(sc->dev);
4599 	sc->intr_type = PCI_INTR_TYPE_MSIX;
4600 back:
4601 	if (error)
4602 		mxge_free_msix(sc, setup);
4603 	return error;
4604 }
4605 
4606 static int
4607 mxge_alloc_intr(struct mxge_softc *sc)
4608 {
4609 	struct mxge_slice_state *ss;
4610 	u_int irq_flags;
4611 
4612 	if (sc->num_slices > 1) {
4613 		int error;
4614 
4615 		error = mxge_alloc_msix(sc);
4616 		if (error)
4617 			return error;
4618 		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
4619 		return 0;
4620 	}
4621 
4622 	ss = &sc->ss[0];
4623 
4624 	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
4625 	    &ss->intr_rid, &irq_flags);
4626 
4627 	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
4628 	    &ss->intr_rid, irq_flags);
4629 	if (ss->intr_res == NULL) {
4630 		device_printf(sc->dev, "could not alloc interrupt\n");
4631 		return ENXIO;
4632 	}
4633 
4634 	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
4635 		ss->intr_func = mxge_legacy;
4636 	else
4637 		ss->intr_func = mxge_msi;
4638 	ss->intr_serialize = &sc->main_serialize;
4639 	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);
4640 
4641 	return 0;
4642 }
4643 
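/*
 * Install the handler for every allocated vector; on failure, tear
 * down the handlers already installed.
 */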
4644 static int
4645 mxge_setup_intr(struct mxge_softc *sc)
4646 {
4647 	int i;
4648 
4649 	for (i = 0; i < sc->num_slices; ++i) {
4650 		struct mxge_slice_state *ss = &sc->ss[i];
4651 		int error;
4652 
4653 		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
4654 		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
4655 		    ss->intr_serialize, ss->intr_desc);
4656 		if (error) {
4657 			device_printf(sc->dev, "can't set up intr %d\n", i);
4658 			mxge_teardown_intr(sc, i);
4659 			return error;
4660 		}
4661 	}
4662 	return 0;
4663 }
4664 
4665 static void
4666 mxge_teardown_intr(struct mxge_softc *sc, int cnt)
4667 {
4668 	int i;
4669 
4670 	if (sc->ss == NULL)
4671 		return;
4672 
4673 	for (i = 0; i < cnt; ++i) {
4674 		struct mxge_slice_state *ss = &sc->ss[i];
4675 
4676 		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
4677 	}
4678 }
4679 
4680 static void
4681 mxge_free_intr(struct mxge_softc *sc)
4682 {
4683 	if (sc->ss == NULL)
4684 		return;
4685 
4686 	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
4687 		struct mxge_slice_state *ss = &sc->ss[0];
4688 
4689 		if (ss->intr_res != NULL) {
4690 			bus_release_resource(sc->dev, SYS_RES_IRQ,
4691 			    ss->intr_rid, ss->intr_res);
4692 		}
4693 		if (sc->intr_type == PCI_INTR_TYPE_MSI)
4694 			pci_release_msi(sc->dev);
4695 	} else {
4696 		mxge_free_msix(sc, TRUE);
4697 	}
4698 }
4699