xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 926deccb)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
29 
30 ***************************************************************************/
31 
32 #include "opt_inet.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/in_cksum.h>
40 #include <sys/sockio.h>
41 #include <sys/mbuf.h>
42 #include <sys/malloc.h>
43 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/serialize.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <sys/bus.h>
68 #include <sys/rman.h>
69 
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
73 
74 #include <vm/vm.h>		/* for pmap_mapdev() */
75 #include <vm/pmap.h>
76 
77 #if defined(__i386__) || defined(__x86_64__)
78 #include <machine/specialreg.h>
79 #endif
80 
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 #include <dev/netif/mxge/if_mxge_var.h>
84 
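/*
 * Small receive buffers live in a plain mbuf: MHLEN bytes of
 * storage, less the MXGEFW_PAD bytes the firmware prepends to each
 * received frame so that the IP header ends up 4-byte aligned.
 */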
85 #define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)
86 
87 /* tunable params */
88 static int mxge_nvidia_ecrc_enable = 1;
89 static int mxge_force_firmware = 0;
90 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
91 static int mxge_deassert_wait = 1;
92 static int mxge_flow_control = 1;
93 static int mxge_ticks;
94 static int mxge_max_slices = 1;
95 static int mxge_always_promisc = 0;
96 static int mxge_throttle = 0;
97 static int mxge_msi_enable = 1;
98 
99 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
100 static const char *mxge_fw_aligned = "mxge_eth_z8e";
101 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
102 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
103 
104 TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
105 TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
106 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
107 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
108 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
109 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
110 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
111 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
112 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
113 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
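
/*
 * The tunables above are read once at module load; they can be set
 * from loader.conf(5), e.g. (the values here are only illustrative):
 *
 *   hw.mxge.intr_coal_delay="30"
 *   hw.mxge.flow_control_enabled="1"
 *   hw.mxge.max_slices="1"
 */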
114 
115 static int mxge_probe(device_t dev);
116 static int mxge_attach(device_t dev);
117 static int mxge_detach(device_t dev);
118 static int mxge_shutdown(device_t dev);
119 
120 static device_method_t mxge_methods[] = {
121 	/* Device interface */
122 	DEVMETHOD(device_probe, mxge_probe),
123 	DEVMETHOD(device_attach, mxge_attach),
124 	DEVMETHOD(device_detach, mxge_detach),
125 	DEVMETHOD(device_shutdown, mxge_shutdown),
126 	DEVMETHOD_END
127 };
128 
129 static driver_t mxge_driver = {
130 	"mxge",
131 	mxge_methods,
132 	sizeof(mxge_softc_t),
133 };
134 
135 static devclass_t mxge_devclass;
136 
137 /* Declare ourselves to be a child of the PCI bus. */
138 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
139 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
140 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
141 
142 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
143 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
144 static void mxge_close(mxge_softc_t *sc, int down);
145 static int mxge_open(mxge_softc_t *sc);
146 static void mxge_tick(void *arg);
147 static void mxge_watchdog_reset(mxge_softc_t *sc);
148 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
149 
150 static int
151 mxge_probe(device_t dev)
152 {
153 	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
154 	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
155 	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
156 		int rev = pci_get_revid(dev);
157 
158 		switch (rev) {
159 		case MXGE_PCI_REV_Z8E:
160 			device_set_desc(dev, "Myri10G-PCIE-8A");
161 			break;
162 		case MXGE_PCI_REV_Z8ES:
163 			device_set_desc(dev, "Myri10G-PCIE-8B");
164 			break;
165 		default:
166 			device_set_desc(dev, "Myri10G-PCIE-8??");
167 			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
168 			break;
169 		}
170 		return 0;
171 	}
172 	return ENXIO;
173 }
174 
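/*
 * Remap the NIC's SRAM aperture write-combining, so the many small
 * PIO copies of send descriptors and firmware commands can be
 * coalesced into larger bus writes.
 */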
175 static void
176 mxge_enable_wc(mxge_softc_t *sc)
177 {
178 #if defined(__i386__) || defined(__x86_64__)
179 	vm_offset_t len;
180 
181 	sc->wc = 1;
182 	len = rman_get_size(sc->mem_res);
183 	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
184 	    PAT_WRITE_COMBINING);
185 #endif
186 }
187 
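/*
 * Allocate zeroed, bus-coherent DMA memory for host/firmware shared
 * structures.  Blocks of up to a page are kept inside a single 4KB
 * region (boundary = 4096); a page-aligned allocation larger than a
 * page necessarily spans 4KB boundaries, so no boundary is imposed.
 */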
188 static int
189 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
190     bus_size_t alignment)
191 {
192 	bus_size_t boundary;
193 	int err;
194 
195 	if (bytes > 4096 && alignment == 4096)
196 		boundary = 0;
197 	else
198 		boundary = 4096;
199 
200 	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
201 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
202 	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
203 	if (err != 0) {
204 		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
205 		return err;
206 	}
207 	return 0;
208 }
209 
210 static void
211 mxge_dma_free(bus_dmamem_t *dma)
212 {
213 	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
214 	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
215 	bus_dma_tag_destroy(dma->dmem_tag);
216 }
217 
218 /*
219  * The eeprom strings on the lanaiX have the format
220  * SN=x\0
221  * MAC=x:x:x:x:x:x\0
222  * PC=text\0
223  */
224 static int
225 mxge_parse_strings(mxge_softc_t *sc)
226 {
227 	const char *ptr;
228 	int i, found_mac, found_sn2;
229 	char *endptr;
230 
231 	ptr = sc->eeprom_strings;
232 	found_mac = 0;
233 	found_sn2 = 0;
234 	while (*ptr != '\0') {
235 		if (strncmp(ptr, "MAC=", 4) == 0) {
236 			ptr += 4;
237 			for (i = 0;;) {
238 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
239 				if (endptr - ptr != 2)
240 					goto abort;
241 				ptr = endptr;
242 				if (++i == 6)
243 					break;
244 				if (*ptr++ != ':')
245 					goto abort;
246 			}
247 			found_mac = 1;
248 		} else if (strncmp(ptr, "PC=", 3) == 0) {
249 			ptr += 3;
250 			strlcpy(sc->product_code_string, ptr,
251 			    sizeof(sc->product_code_string));
252 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
253 			ptr += 3;
254 			strlcpy(sc->serial_number_string, ptr,
255 			    sizeof(sc->serial_number_string));
256 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
257 			/* SN2 takes precedence over SN */
258 			ptr += 4;
259 			found_sn2 = 1;
260 			strlcpy(sc->serial_number_string, ptr,
261 			    sizeof(sc->serial_number_string));
262 		}
263 		while (*ptr++ != '\0') {}
264 	}
265 
266 	if (found_mac)
267 		return 0;
268 
269 abort:
270 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
271 	return ENXIO;
272 }
273 
274 #if defined(__i386__) || defined(__x86_64__)
275 
276 static void
277 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
278 {
279 	uint32_t val;
280 	unsigned long base, off;
281 	char *va, *cfgptr;
282 	device_t pdev, mcp55;
283 	uint16_t vendor_id, device_id, word;
284 	uintptr_t bus, slot, func, ivend, idev;
285 	uint32_t *ptr32;
286 
287 	if (!mxge_nvidia_ecrc_enable)
288 		return;
289 
290 	pdev = device_get_parent(device_get_parent(sc->dev));
291 	if (pdev == NULL) {
292 		device_printf(sc->dev, "could not find parent?\n");
293 		return;
294 	}
295 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
296 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
297 
298 	if (vendor_id != 0x10de)
299 		return;
300 
301 	base = 0;
302 
303 	if (device_id == 0x005d) {
304 		/* ck804, base address is magic */
305 		base = 0xe0000000UL;
306 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
307 		/* mcp55, base address stored in chipset */
308 		mcp55 = pci_find_bsf(0, 0, 0);
309 		if (mcp55 &&
310 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
311 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
312 			word = pci_read_config(mcp55, 0x90, 2);
313 			base = ((unsigned long)word & 0x7ffeU) << 25;
314 		}
315 	}
316 	if (!base)
317 		return;
318 
319 	/*
320 	 * XXXX
321 	 * Test below is commented because it is believed that doing
322 	 * config read/write beyond 0xff will access the config space
323 	 * for the next larger function.  Uncomment this and remove
324 	 * the hacky pmap_mapdev() way of accessing config space when
325 	 * DragonFly grows support for extended pcie config space access.
326 	 */
327 #if 0
328 	/*
329 	 * See if we can, by some miracle, access the extended
330 	 * config space
331 	 */
332 	val = pci_read_config(pdev, 0x178, 4);
333 	if (val != 0xffffffff) {
334 		val |= 0x40;
335 		pci_write_config(pdev, 0x178, val, 4);
336 		return;
337 	}
338 #endif
339 	/*
340 	 * Rather than using normal pci config space writes, we must
341 	 * map the Nvidia config space ourselves.  This is because on
342 	 * opteron/nvidia class machines the 0xe0000000 mapping is
343 	 * handled by the nvidia chipset, which means the internal PCI
344 	 * device (the on-chip northbridge), or the amd-8131 bridge
345 	 * and things behind them are not visible by this method.
346 	 */
347 
348 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
349 		      PCI_IVAR_BUS, &bus);
350 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
351 		      PCI_IVAR_SLOT, &slot);
352 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
353 		      PCI_IVAR_FUNCTION, &func);
354 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
355 		      PCI_IVAR_VENDOR, &ivend);
356 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
357 		      PCI_IVAR_DEVICE, &idev);
358 
359 	off =  base + 0x00100000UL * (unsigned long)bus +
360 	    0x00001000UL * (unsigned long)(func + 8 * slot);
361 
362 	/* map it into the kernel */
363 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
364 	if (va == NULL) {
365 		device_printf(sc->dev, "pmap_mapdev failed\n");
366 		return;
367 	}
368 	/* get a pointer to the config space mapped into the kernel */
369 	cfgptr = va + (off & PAGE_MASK);
370 
371 	/* make sure that we can really access it */
372 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
373 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
374 	if (!(vendor_id == ivend && device_id == idev)) {
375 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
376 		    vendor_id, device_id);
377 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
378 		return;
379 	}
380 
381 	ptr32 = (uint32_t*)(cfgptr + 0x178);
382 	val = *ptr32;
383 
384 	if (val == 0xffffffff) {
385 		device_printf(sc->dev, "extended mapping failed\n");
386 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
387 		return;
388 	}
389 	*ptr32 = val | 0x40;
390 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
391 	if (bootverbose) {
392 		device_printf(sc->dev, "Enabled ECRC on upstream "
393 		    "Nvidia bridge at %d:%d:%d\n",
394 		    (int)bus, (int)slot, (int)func);
395 	}
396 }
397 
398 #else	/* __i386__ || __x86_64__ */
399 
400 static void
401 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
402 {
403 	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
404 }
405 
406 #endif
407 
408 static int
409 mxge_dma_test(mxge_softc_t *sc, int test_type)
410 {
411 	mxge_cmd_t cmd;
412 	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
413 	int status;
414 	uint32_t len;
415 	const char *test = " ";
416 
417 	/*
418 	 * Run a small DMA test.
419 	 * The magic multipliers to the length tell the firmware
420 	 * to do DMA read, write, or read+write tests.  The
421 	 * results are returned in cmd.data0.  The upper 16
422 	 * bits of the return are the number of transfers completed.
423 	 * The lower 16 bits are the time in 0.5us ticks that the
424 	 * transfers took to complete.
425 	 */
426 
427 	len = sc->tx_boundary;
428 
429 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
430 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
431 	cmd.data2 = len * 0x10000;
432 	status = mxge_send_cmd(sc, test_type, &cmd);
433 	if (status != 0) {
434 		test = "read";
435 		goto abort;
436 	}
437 	sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
438 
439 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
440 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
441 	cmd.data2 = len * 0x1;
442 	status = mxge_send_cmd(sc, test_type, &cmd);
443 	if (status != 0) {
444 		test = "write";
445 		goto abort;
446 	}
447 	sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
448 
449 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
450 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
451 	cmd.data2 = len * 0x10001;
452 	status = mxge_send_cmd(sc, test_type, &cmd);
453 	if (status != 0) {
454 		test = "read/write";
455 		goto abort;
456 	}
457 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
458 	    (cmd.data0 & 0xffff);
459 
460 abort:
461 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
462 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
463 		    test, status);
464 	}
465 	return status;
466 }
467 
468 /*
469  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
470  * when the PCI-E Completion packets are aligned on an 8-byte
471  * boundary.  Some PCI-E chip sets always align Completion packets; on
472  * the ones that do not, the alignment can be enforced by enabling
473  * ECRC generation (if supported).
474  *
475  * When PCI-E Completion packets are not aligned, it is actually more
476  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
477  *
478  * If the driver can neither enable ECRC nor verify that it has
479  * already been enabled, then it must use a firmware image which works
480  * around unaligned completion packets (ethp_z8e.dat), and it should
481  * also ensure that it never gives the device a Read-DMA which is
482  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
483  * enabled, then the driver should use the aligned (eth_z8e.dat)
484  * firmware image, and set tx_boundary to 4KB.
485  */
486 static int
487 mxge_firmware_probe(mxge_softc_t *sc)
488 {
489 	device_t dev = sc->dev;
490 	int reg, status;
491 	uint16_t pectl;
492 
493 	sc->tx_boundary = 4096;
494 
495 	/*
496 	 * Verify the max read request size was set to 4KB
497 	 * before trying the test with 4KB.
498 	 */
499 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
500 		pectl = pci_read_config(dev, reg + 0x8, 2);
501 		if ((pectl & (5 << 12)) != (5 << 12)) {
502 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
503 			    pectl);
504 			sc->tx_boundary = 2048;
505 		}
506 	}
507 
508 	/*
509 	 * Load the optimized firmware (which assumes aligned PCIe
510 	 * completions) in order to see if it works on this host.
511 	 */
512 	sc->fw_name = mxge_fw_aligned;
513 	status = mxge_load_firmware(sc, 1);
514 	if (status != 0)
515 		return status;
516 
517 	/*
518 	 * Enable ECRC if possible
519 	 */
520 	mxge_enable_nvidia_ecrc(sc);
521 
522 	/*
523 	 * Run a DMA test which watches for unaligned completions and
524 	 * aborts on the first one seen.  Not required on Z8ES or newer.
525 	 */
526 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
527 		return 0;
528 
529 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
530 	if (status == 0)
531 		return 0; /* keep the aligned firmware */
532 
533 	if (status != E2BIG)
534 		device_printf(dev, "DMA test failed: %d\n", status);
535 	if (status == ENOSYS) {
536 		device_printf(dev, "Falling back to ethp! "
537 		    "Please install up-to-date firmware\n");
538 	}
539 	return status;
540 }
541 
542 static int
543 mxge_select_firmware(mxge_softc_t *sc)
544 {
545 	int aligned = 0;
546 	int force_firmware = mxge_force_firmware;
547 
548 	if (sc->throttle)
549 		force_firmware = sc->throttle;
550 
551 	if (force_firmware != 0) {
552 		if (force_firmware == 1)
553 			aligned = 1;
554 		else
555 			aligned = 0;
556 		if (bootverbose) {
557 			device_printf(sc->dev,
558 			    "Assuming %s completions (forced)\n",
559 			    aligned ? "aligned" : "unaligned");
560 		}
561 		goto abort;
562 	}
563 
564 	/*
565 	 * If the PCIe link width is 4 or less, we can use the aligned
566 	 * firmware and skip any checks
567 	 */
568 	if (sc->link_width != 0 && sc->link_width <= 4) {
569 		device_printf(sc->dev, "PCIe x%d Link, "
570 		    "expect reduced performance\n", sc->link_width);
571 		aligned = 1;
572 		goto abort;
573 	}
574 
575 	if (mxge_firmware_probe(sc) == 0)
576 		return 0;
577 
578 abort:
579 	if (aligned) {
580 		sc->fw_name = mxge_fw_aligned;
581 		sc->tx_boundary = 4096;
582 	} else {
583 		sc->fw_name = mxge_fw_unaligned;
584 		sc->tx_boundary = 2048;
585 	}
586 	return mxge_load_firmware(sc, 0);
587 }
588 
589 static int
590 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
591 {
592 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
593 		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
594 		    be32toh(hdr->mcp_type));
595 		return EIO;
596 	}
597 
598 	/* Save firmware version for sysctl */
599 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
600 	if (bootverbose)
601 		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
602 
603 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
604 	    &sc->fw_ver_minor, &sc->fw_ver_tiny);
605 
606 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
607 	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
608 		if_printf(sc->ifp, "Found firmware version %s\n",
609 		    sc->fw_version);
610 		if_printf(sc->ifp, "Driver needs %d.%d\n",
611 		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
612 		return EINVAL;
613 	}
614 	return 0;
615 }
616 
617 static void *
618 z_alloc(void *nil, u_int items, u_int size)
619 {
620 	return kmalloc(items * size, M_TEMP, M_WAITOK);
621 }
622 
623 static void
624 z_free(void *nil, void *ptr)
625 {
626 	kfree(ptr, M_TEMP);
627 }
628 
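/*
 * Fetch the zlib-compressed firmware image via firmware(9), inflate
 * it (the uncompressed size is stashed in fw->version), validate the
 * embedded mcp_gen_header, and PIO-copy the image into NIC SRAM in
 * 256-byte chunks, with a read-back between chunks to flush the
 * posted writes.
 */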
629 static int
630 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
631 {
632 	z_stream zs;
633 	char *inflate_buffer;
634 	const struct firmware *fw;
635 	const mcp_gen_header_t *hdr;
636 	unsigned hdr_offset;
637 	int status;
638 	unsigned int i;
639 	char dummy;
640 	size_t fw_len;
641 
642 	fw = firmware_get(sc->fw_name);
643 	if (fw == NULL) {
644 		if_printf(sc->ifp, "Could not find firmware image %s\n",
645 		    sc->fw_name);
646 		return ENOENT;
647 	}
648 
649 	/* Setup zlib and decompress f/w */
650 	bzero(&zs, sizeof(zs));
651 	zs.zalloc = z_alloc;
652 	zs.zfree = z_free;
653 	status = inflateInit(&zs);
654 	if (status != Z_OK) {
655 		status = EIO;
656 		goto abort_with_fw;
657 	}
658 
659 	/*
660 	 * The uncompressed size is stored as the firmware version,
661 	 * which would otherwise go unused
662 	 */
663 	fw_len = (size_t)fw->version;
664 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
665 	zs.avail_in = fw->datasize;
666 	zs.next_in = __DECONST(char *, fw->data);
667 	zs.avail_out = fw_len;
668 	zs.next_out = inflate_buffer;
669 	status = inflate(&zs, Z_FINISH);
670 	if (status != Z_STREAM_END) {
671 		if_printf(sc->ifp, "zlib %d\n", status);
672 		status = EIO;
673 		goto abort_with_buffer;
674 	}
675 
676 	/* Check id */
677 	hdr_offset =
678 	    be32toh(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
679 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
680 		if_printf(sc->ifp, "Bad firmware file\n");
681 		status = EIO;
682 		goto abort_with_buffer;
683 	}
684 	hdr = (const void*)(inflate_buffer + hdr_offset);
685 
686 	status = mxge_validate_firmware(sc, hdr);
687 	if (status != 0)
688 		goto abort_with_buffer;
689 
690 	/* Copy the inflated firmware to NIC SRAM. */
691 	for (i = 0; i < fw_len; i += 256) {
692 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
693 		    min(256U, (unsigned)(fw_len - i)));
694 		wmb();
695 		dummy = *sc->sram;
696 		wmb();
697 	}
698 
699 	*limit = fw_len;
700 	status = 0;
701 abort_with_buffer:
702 	kfree(inflate_buffer, M_TEMP);
703 	inflateEnd(&zs);
704 abort_with_fw:
705 	firmware_put(fw, FIRMWARE_UNLOAD);
706 	return status;
707 }
708 
709 /*
710  * Enable or disable periodic RDMAs from the host to make certain
711  * chipsets resend dropped PCIe messages
712  */
713 static void
714 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
715 {
716 	char buf_bytes[72];
717 	volatile uint32_t *confirm;
718 	volatile char *submit;
719 	uint32_t *buf, dma_low, dma_high;
720 	int i;
721 
722 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
723 
724 	/* Clear confirmation addr */
725 	confirm = (volatile uint32_t *)sc->cmd;
726 	*confirm = 0;
727 	wmb();
728 
729 	/*
730 	 * Send an rdma command to the PCIe engine, and wait for the
731 	 * response in the confirmation address.  The firmware should
732 	 * write a -1 there to indicate it is alive and well
733 	 */
734 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
735 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
736 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
737 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
738 	buf[2] = htobe32(0xffffffff);		/* confirm data */
739 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
740 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
741 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
742 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
743 	buf[5] = htobe32(enable);		/* enable? */
744 
745 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
746 
747 	mxge_pio_copy(submit, buf, 64);
748 	wmb();
749 	DELAY(1000);
750 	wmb();
751 	i = 0;
752 	while (*confirm != 0xffffffff && i < 20) {
753 		DELAY(1000);
754 		i++;
755 	}
756 	if (*confirm != 0xffffffff) {
757 		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
758 		    (enable ? "enable" : "disable"), confirm, *confirm);
759 	}
760 }
761 
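/*
 * Issue one command to the firmware: the 8-byte-aligned request is
 * PIO'd to the MXGEFW_ETH_CMD mailbox in SRAM, and the firmware DMAs
 * its status back into the host-resident response block (sc->cmd),
 * which is polled below for up to 20ms.
 */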
762 static int
763 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
764 {
765 	mcp_cmd_t *buf;
766 	char buf_bytes[sizeof(*buf) + 8];
767 	volatile mcp_cmd_response_t *response = sc->cmd;
768 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
769 	uint32_t dma_low, dma_high;
770 	int err, sleep_total = 0;
771 
772 	/* Ensure buf is aligned to 8 bytes */
773 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
774 
775 	buf->data0 = htobe32(data->data0);
776 	buf->data1 = htobe32(data->data1);
777 	buf->data2 = htobe32(data->data2);
778 	buf->cmd = htobe32(cmd);
779 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
780 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
781 
782 	buf->response_addr.low = htobe32(dma_low);
783 	buf->response_addr.high = htobe32(dma_high);
784 
785 	response->result = 0xffffffff;
786 	wmb();
787 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
788 
789 	/*
790 	 * Wait up to 20ms
791 	 */
792 	err = EAGAIN;
793 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
794 		wmb();
795 		switch (be32toh(response->result)) {
796 		case 0:
797 			data->data0 = be32toh(response->data);
798 			err = 0;
799 			break;
800 		case 0xffffffff:
801 			DELAY(1000);
802 			break;
803 		case MXGEFW_CMD_UNKNOWN:
804 			err = ENOSYS;
805 			break;
806 		case MXGEFW_CMD_ERROR_UNALIGNED:
807 			err = E2BIG;
808 			break;
809 		case MXGEFW_CMD_ERROR_BUSY:
810 			err = EBUSY;
811 			break;
812 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
813 			err = ENXIO;
814 			break;
815 		default:
816 			if_printf(sc->ifp, "command %d failed, result = %d\n",
817 			    cmd, be32toh(response->result));
818 			err = ENXIO;
819 			break;
820 		}
821 		if (err != EAGAIN)
822 			break;
823 	}
824 	if (err == EAGAIN) {
825 		if_printf(sc->ifp, "command %d timed out, result = %d\n",
826 		    cmd, be32toh(response->result));
827 	}
828 	return err;
829 }
830 
831 static int
832 mxge_adopt_running_firmware(mxge_softc_t *sc)
833 {
834 	struct mcp_gen_header *hdr;
835 	const size_t bytes = sizeof(struct mcp_gen_header);
836 	size_t hdr_offset;
837 	int status;
838 
839 	/*
840 	 * Find running firmware header
841 	 */
842 	hdr_offset =
843 	    be32toh(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
844 
845 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
846 		if_printf(sc->ifp, "Running firmware has bad header offset "
847 		    "(%zu)\n", hdr_offset);
848 		return EIO;
849 	}
850 
851 	/*
852 	 * Copy header of running firmware from SRAM to host memory to
853 	 * validate firmware
854 	 */
855 	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
856 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
857 	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
858 	status = mxge_validate_firmware(sc, hdr);
859 	kfree(hdr, M_DEVBUF);
860 
861 	/*
862 	 * Check to see if adopted firmware has bug where adopting
863 	 * it will cause broadcasts to be filtered unless the NIC
864 	 * is kept in ALLMULTI mode
865 	 */
866 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
867 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
868 		sc->adopted_rx_filter_bug = 1;
869 		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
870 		    "working around rx filter bug\n",
871 		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
872 	}
873 
874 	return status;
875 }
876 
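/*
 * Load the MCP firmware.  If inflating/copying a fresh image fails
 * and 'adopt' is set, fall back to validating and adopting whatever
 * firmware is already running on the NIC (with the conservative 2KB
 * tx boundary).  Otherwise hand the new image off to the bootstrap
 * MCP and wait for its confirmation DMA.
 */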
877 static int
878 mxge_load_firmware(mxge_softc_t *sc, int adopt)
879 {
880 	volatile uint32_t *confirm;
881 	volatile char *submit;
882 	char buf_bytes[72];
883 	uint32_t *buf, size, dma_low, dma_high;
884 	int status, i;
885 
886 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
887 
888 	size = sc->sram_size;
889 	status = mxge_load_firmware_helper(sc, &size);
890 	if (status) {
891 		if (!adopt)
892 			return status;
893 
894 		/*
895 		 * Try to use the currently running firmware, if
896 		 * it is new enough
897 		 */
898 		status = mxge_adopt_running_firmware(sc);
899 		if (status) {
900 			if_printf(sc->ifp,
901 			    "failed to adopt running firmware\n");
902 			return status;
903 		}
904 		if_printf(sc->ifp, "Successfully adopted running firmware\n");
905 
906 		if (sc->tx_boundary == 4096) {
907 			if_printf(sc->ifp,
908 			     "Using firmware currently running on NIC.  "
909 			     "For optimal\n");
910 			if_printf(sc->ifp, "performance consider loading "
911 			     "optimized firmware\n");
912 		}
913 		sc->fw_name = mxge_fw_unaligned;
914 		sc->tx_boundary = 2048;
915 		return 0;
916 	}
917 
918 	/* Clear confirmation addr */
919 	confirm = (volatile uint32_t *)sc->cmd;
920 	*confirm = 0;
921 	wmb();
922 
923 	/*
924 	 * Send a reload command to the bootstrap MCP, and wait for the
925 	 * response in the confirmation address.  The firmware should
926 	 * write a -1 there to indicate it is alive and well
927 	 */
928 
929 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
930 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
931 
932 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
933 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
934 	buf[2] = htobe32(0xffffffff);	/* confirm data */
935 
936 	/*
937 	 * FIX: All newest firmware should un-protect the bottom of
938 	 * the sram before handoff. However, the very first interfaces
939 	 * do not. Therefore the handoff copy must skip the first 8 bytes
940 	 */
941 					/* where the code starts */
942 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
943 	buf[4] = htobe32(size - 8); 	/* length of code */
944 	buf[5] = htobe32(8);		/* where to copy to */
945 	buf[6] = htobe32(0);		/* where to jump to */
946 
947 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
948 	mxge_pio_copy(submit, buf, 64);
949 	wmb();
950 	DELAY(1000);
951 	wmb();
952 	i = 0;
953 	while (*confirm != 0xffffffff && i < 20) {
954 		DELAY(1000*10);
955 		i++;
956 	}
957 	if (*confirm != 0xffffffff) {
958 		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
959 		    confirm, *confirm);
960 		return ENXIO;
961 	}
962 	return 0;
963 }
964 
965 static int
966 mxge_update_mac_address(mxge_softc_t *sc)
967 {
968 	mxge_cmd_t cmd;
969 	uint8_t *addr = sc->mac_addr;
970 
971 	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
972 	    (addr[2] << 8) | addr[3];
973 	cmd.data1 = (addr[4] << 8) | (addr[5]);
974 	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
975 }
976 
977 static int
978 mxge_change_pause(mxge_softc_t *sc, int pause)
979 {
980 	mxge_cmd_t cmd;
981 	int status;
982 
983 	if (pause)
984 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
985 	else
986 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
987 	if (status) {
988 		if_printf(sc->ifp, "Failed to set flow control mode\n");
989 		return ENXIO;
990 	}
991 	sc->pause = pause;
992 	return 0;
993 }
994 
995 static void
996 mxge_change_promisc(mxge_softc_t *sc, int promisc)
997 {
998 	mxge_cmd_t cmd;
999 	int status;
1000 
1001 	if (mxge_always_promisc)
1002 		promisc = 1;
1003 
1004 	if (promisc)
1005 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1006 	else
1007 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1008 	if (status)
1009 		if_printf(sc->ifp, "Failed to set promisc mode\n");
1010 }
1011 
1012 static void
1013 mxge_set_multicast_list(mxge_softc_t *sc)
1014 {
1015 	mxge_cmd_t cmd;
1016 	struct ifmultiaddr *ifma;
1017 	struct ifnet *ifp = sc->ifp;
1018 	int err;
1019 
1020 	/* This firmware is known to not support multicast */
1021 	if (!sc->fw_multicast_support)
1022 		return;
1023 
1024 	/* Disable multicast filtering while we play with the lists */
1025 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1026 	if (err != 0) {
1027 		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1028 		    "error status: %d\n", err);
1029 		return;
1030 	}
1031 
1032 	if (sc->adopted_rx_filter_bug)
1033 		return;
1034 
1035 	if (ifp->if_flags & IFF_ALLMULTI) {
1036 		/* Request to disable multicast filtering, so quit here */
1037 		return;
1038 	}
1039 
1040 	/* Flush all the filters */
1041 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1042 	if (err != 0) {
1043 		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1044 		    "error status: %d\n", err);
1045 		return;
1046 	}
1047 
1048 	/*
1049 	 * Walk the multicast list, and add each address
1050 	 */
1051 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1052 		if (ifma->ifma_addr->sa_family != AF_LINK)
1053 			continue;
1054 
1055 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1056 		    &cmd.data0, 4);
1057 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1058 		    &cmd.data1, 2);
1059 		cmd.data0 = htonl(cmd.data0);
1060 		cmd.data1 = htonl(cmd.data1);
1061 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1062 		if (err != 0) {
1063 			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1064 			    "error status: %d\n", err);
1065 			/* Abort, leaving multicast filtering off */
1066 			return;
1067 		}
1068 	}
1069 
1070 	/* Enable multicast filtering */
1071 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1072 	if (err != 0) {
1073 		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1074 		    "error status: %d\n", err);
1075 	}
1076 }
1077 
1078 #if 0
1079 static int
1080 mxge_max_mtu(mxge_softc_t *sc)
1081 {
1082 	mxge_cmd_t cmd;
1083 	int status;
1084 
1085 	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1086 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1087 
1088 	/* try to set nbufs to see if we can
1089 	   use virtually contiguous jumbos */
1090 	cmd.data0 = 0;
1091 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1092 			       &cmd);
1093 	if (status == 0)
1094 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1095 
1096 	/* otherwise, we're limited to MJUMPAGESIZE */
1097 	return MJUMPAGESIZE - MXGEFW_PAD;
1098 }
1099 #endif
1100 
1101 static int
1102 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1103 {
1104 	struct mxge_slice_state *ss;
1105 	mxge_rx_done_t *rx_done;
1106 	volatile uint32_t *irq_claim;
1107 	mxge_cmd_t cmd;
1108 	int slice, status;
1109 
1110 	/*
1111 	 * Try to send a reset command to the card to see if it
1112 	 * is alive
1113 	 */
1114 	memset(&cmd, 0, sizeof (cmd));
1115 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1116 	if (status != 0) {
1117 		if_printf(sc->ifp, "failed reset\n");
1118 		return ENXIO;
1119 	}
1120 
1121 	mxge_dummy_rdma(sc, 1);
1122 
1123 	/* Set the intrq size */
1124 	cmd.data0 = sc->rx_ring_size;
1125 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1126 
1127 	/*
1128 	 * Even though we already know how many slices are supported
1129 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1130 	 * has magic side effects, and must be called after a reset.
1131 	 * It must be called prior to calling any RSS related cmds,
1132 	 * including assigning an interrupt queue for anything but
1133 	 * slice 0.  It must also be called *after*
1134 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1135 	 * the firmware to compute offsets.
1136 	 */
1137 	if (sc->num_slices > 1) {
1138 		/* Ask the maximum number of slices it supports */
1139 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1140 		if (status != 0) {
1141 			if_printf(sc->ifp, "failed to get number of slices\n");
1142 			return status;
1143 		}
1144 
1145 		/*
1146 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1147 		 * to setting up the interrupt queue DMA
1148 		 */
1149 		cmd.data0 = sc->num_slices;
1150 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1151 #ifdef IFNET_BUF_RING
1152 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1153 #endif
1154 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1155 		if (status != 0) {
1156 			if_printf(sc->ifp, "failed to set number of slices\n");
1157 			return status;
1158 		}
1159 	}
1160 
1161 	if (interrupts_setup) {
1162 		/* Now exchange information about interrupts  */
1163 		for (slice = 0; slice < sc->num_slices; slice++) {
1164 			ss = &sc->ss[slice];
1165 
1166 			rx_done = &ss->rx_data.rx_done;
1167 			memset(rx_done->entry, 0, sc->rx_ring_size);
1168 
1169 			cmd.data0 =
1170 			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1171 			cmd.data1 =
1172 			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1173 			cmd.data2 = slice;
1174 			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
1175 			    &cmd);
1176 		}
1177 	}
1178 
1179 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1180 	    &cmd);
1181 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1182 
1183 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1184 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1185 
1186 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1187 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1188 
1189 	if (status != 0) {
1190 		if_printf(sc->ifp, "failed to set interrupt parameters\n");
1191 		return status;
1192 	}
1193 
1194 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1195 
1196 	/* Run a DMA benchmark */
1197 	mxge_dma_test(sc, MXGEFW_DMA_TEST);
1198 
1199 	for (slice = 0; slice < sc->num_slices; slice++) {
1200 		ss = &sc->ss[slice];
1201 
1202 		ss->irq_claim = irq_claim + (2 * slice);
1203 
1204 		/* Reset mcp/driver shared state back to 0 */
1205 		ss->rx_data.rx_done.idx = 0;
1206 		ss->tx.req = 0;
1207 		ss->tx.done = 0;
1208 		ss->tx.pkt_done = 0;
1209 		ss->tx.queue_active = 0;
1210 		ss->tx.activate = 0;
1211 		ss->tx.deactivate = 0;
1212 		ss->rx_data.rx_big.cnt = 0;
1213 		ss->rx_data.rx_small.cnt = 0;
1214 		if (ss->fw_stats != NULL)
1215 			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1216 	}
1217 	sc->rdma_tags_available = 15;
1218 
1219 	status = mxge_update_mac_address(sc);
1220 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1221 	mxge_change_pause(sc, sc->pause);
1222 	mxge_set_multicast_list(sc);
1223 
1224 	if (sc->throttle) {
1225 		cmd.data0 = sc->throttle;
1226 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1227 			if_printf(sc->ifp, "can't enable throttle\n");
1228 	}
1229 	return status;
1230 }
1231 
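/*
 * The sysctl handlers below share a pattern: snapshot the current
 * value, let sysctl_handle_int() apply any update from userland,
 * validate the result, and push it to the firmware while holding
 * the ifnet serializer.
 */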
1232 static int
1233 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1234 {
1235 	mxge_cmd_t cmd;
1236 	mxge_softc_t *sc;
1237 	int err;
1238 	unsigned int throttle;
1239 
1240 	sc = arg1;
1241 	throttle = sc->throttle;
1242 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1243 	if (err != 0)
1244 		return err;
1245 
1246 	if (throttle == sc->throttle)
1247 		return 0;
1248 
1249 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1250 		return EINVAL;
1251 
1252 	ifnet_serialize_all(sc->ifp);
1253 
1254 	cmd.data0 = throttle;
1255 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1256 	if (err == 0)
1257 		sc->throttle = throttle;
1258 
1259 	ifnet_deserialize_all(sc->ifp);
1260 	return err;
1261 }
1262 
1263 static int
1264 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1265 {
1266 	mxge_softc_t *sc;
1267 	unsigned int intr_coal_delay;
1268 	int err;
1269 
1270 	sc = arg1;
1271 	intr_coal_delay = sc->intr_coal_delay;
1272 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1273 	if (err != 0)
1274 		return err;
1275 
1276 	if (intr_coal_delay == sc->intr_coal_delay)
1277 		return 0;
1278 
1279 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1280 		return EINVAL;
1281 
1282 	ifnet_serialize_all(sc->ifp);
1283 
1284 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1285 	sc->intr_coal_delay = intr_coal_delay;
1286 
1287 	ifnet_deserialize_all(sc->ifp);
1288 	return err;
1289 }
1290 
1291 static int
1292 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1293 {
1294 	mxge_softc_t *sc;
1295 	unsigned int enabled;
1296 	int err;
1297 
1298 	sc = arg1;
1299 	enabled = sc->pause;
1300 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1301 	if (err != 0)
1302 		return err;
1303 
1304 	if (enabled == sc->pause)
1305 		return 0;
1306 
1307 	ifnet_serialize_all(sc->ifp);
1308 	err = mxge_change_pause(sc, enabled);
1309 	ifnet_deserialize_all(sc->ifp);
1310 
1311 	return err;
1312 }
1313 
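/*
 * Read-only sysctl handler for the firmware's big-endian counters:
 * byte-swap the value with be32toh() and report it through
 * sysctl_handle_int() as a plain integer.
 */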
1314 static int
1315 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1316 {
1317 	int err;
1318 
1319 	if (arg1 == NULL)
1320 		return EFAULT;
1321 	arg2 = be32toh(*(int *)arg1);
1322 	arg1 = NULL;
1323 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1324 
1325 	return err;
1326 }
1327 
1328 static void
1329 mxge_rem_sysctls(mxge_softc_t *sc)
1330 {
1331 	if (sc->ss != NULL) {
1332 		struct mxge_slice_state *ss;
1333 		int slice;
1334 
1335 		for (slice = 0; slice < sc->num_slices; slice++) {
1336 			ss = &sc->ss[slice];
1337 			if (ss->sysctl_tree != NULL) {
1338 				sysctl_ctx_free(&ss->sysctl_ctx);
1339 				ss->sysctl_tree = NULL;
1340 			}
1341 		}
1342 	}
1343 
1344 	if (sc->slice_sysctl_tree != NULL) {
1345 		sysctl_ctx_free(&sc->slice_sysctl_ctx);
1346 		sc->slice_sysctl_tree = NULL;
1347 	}
1348 
1349 	if (sc->sysctl_tree != NULL) {
1350 		sysctl_ctx_free(&sc->sysctl_ctx);
1351 		sc->sysctl_tree = NULL;
1352 	}
1353 }
1354 
1355 static void
1356 mxge_add_sysctls(mxge_softc_t *sc)
1357 {
1358 	struct sysctl_ctx_list *ctx;
1359 	struct sysctl_oid_list *children;
1360 	mcp_irq_data_t *fw;
1361 	struct mxge_slice_state *ss;
1362 	int slice;
1363 	char slice_num[8];
1364 
1365 	ctx = &sc->sysctl_ctx;
1366 	sysctl_ctx_init(ctx);
1367 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1368 	    OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
1369 	if (sc->sysctl_tree == NULL) {
1370 		device_printf(sc->dev, "can't add sysctl node\n");
1371 		return;
1372 	}
1373 
1374 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1375 	fw = sc->ss[0].fw_stats;
1376 
1377 	/*
1378 	 * Random information
1379 	 */
1380 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1381 	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1382 
1383 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1384 	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1385 
1386 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1387 	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1388 
1389 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1390 	    CTLFLAG_RD, &sc->link_width, 0, "link width");
1391 
1392 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1393 	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1394 
1395 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1396 	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1397 
1398 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1399 	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1400 
1401 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1402 	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1403 
1404 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1405 	    CTLFLAG_RD, &sc->read_write_dma, 0,
1406 	    "DMA concurrent Read/Write speed in MB/s");
1407 
1408 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1409 	    CTLFLAG_RD, &sc->watchdog_resets, 0,
1410 	    "Number of times NIC was reset");
1411 
1412 	/*
1413 	 * Performance related tunables
1414 	 */
1415 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1416 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1417 	    "Interrupt coalescing delay in usecs");
1418 
1419 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1420 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1421 	    "Transmit throttling");
1422 
1423 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
1424 	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
1425 	    "Enable flow control (pause frames)");
1426 
1427 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1428 	    CTLFLAG_RW, &mxge_deassert_wait, 0,
1429 	    "Wait for IRQ line to go low in ihandler");
1430 
1431 	/*
1432 	 * Stats block from firmware is in network byte order.
1433 	 * Need to swap it
1434 	 */
1435 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1436 	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1437 	    mxge_handle_be32, "I", "link up");
1438 
1439 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1440 	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1441 	    mxge_handle_be32, "I", "rdma_tags_available");
1442 
1443 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1444 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1445 	    mxge_handle_be32, "I", "dropped_bad_crc32");
1446 
1447 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1448 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1449 	    mxge_handle_be32, "I", "dropped_bad_phy");
1450 
1451 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1452 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1453 	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1454 
1455 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1456 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1457 	    mxge_handle_be32, "I", "dropped_link_overflow");
1458 
1459 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1460 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1461 	    mxge_handle_be32, "I", "dropped_multicast_filtered");
1462 
1463 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1464 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1465 	    mxge_handle_be32, "I", "dropped_no_big_buffer");
1466 
1467 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1468 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1469 	    mxge_handle_be32, "I", "dropped_no_small_buffer");
1470 
1471 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1472 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1473 	    mxge_handle_be32, "I", "dropped_overrun");
1474 
1475 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1476 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1477 	    mxge_handle_be32, "I", "dropped_pause");
1478 
1479 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1480 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1481 	    mxge_handle_be32, "I", "dropped_runt");
1482 
1483 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1484 	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1485 	    mxge_handle_be32, "I", "dropped_unicast_filtered");
1486 
1487 	/* add counters exported for debugging from all slices */
1488 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1489 	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1490 	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1491 	if (sc->slice_sysctl_tree == NULL) {
1492 		device_printf(sc->dev, "can't add slice sysctl node\n");
1493 		return;
1494 	}
1495 
1496 	for (slice = 0; slice < sc->num_slices; slice++) {
1497 		ss = &sc->ss[slice];
1498 		sysctl_ctx_init(&ss->sysctl_ctx);
1499 		ctx = &ss->sysctl_ctx;
1500 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1501 		ksprintf(slice_num, "%d", slice);
1502 		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1503 		    slice_num, CTLFLAG_RD, 0, "");
1504 		if (ss->sysctl_tree == NULL) {
1505 			device_printf(sc->dev,
1506 			    "can't add %d slice sysctl node\n", slice);
1507 			return;	/* XXX continue? */
1508 		}
1509 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1510 
1511 		/*
1512 		 * XXX change to ULONG
1513 		 */
1514 
1515 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1516 		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");
1517 
1518 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1519 		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");
1520 
1521 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1522 		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1523 
1524 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1525 		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1526 
1527 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1528 		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1529 
1530 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1531 		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1532 
1533 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1534 		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1535 
1536 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1537 		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1538 	}
1539 }
1540 
1541 /*
1542  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1543  * backwards one at a time and handle ring wraps
1544  */
1545 static __inline void
1546 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1547     mcp_kreq_ether_send_t *src, int cnt)
1548 {
1549 	int idx, starting_slot;
1550 
1551 	starting_slot = tx->req;
1552 	while (cnt > 1) {
1553 		cnt--;
1554 		idx = (starting_slot + cnt) & tx->mask;
1555 		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
1556 		wmb();
1557 	}
1558 }
1559 
1560 /*
1561  * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1562  * at most 32 bytes at a time, so as to avoid involving the software
1563  * pio handler in the nic.  We re-write the first segment's flags
1564  * to mark them valid only after writing the entire chain
1565  */
1566 static __inline void
1567 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1568 {
1569 	int idx, i;
1570 	uint32_t *src_ints;
1571 	volatile uint32_t *dst_ints;
1572 	mcp_kreq_ether_send_t *srcp;
1573 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1574 	uint8_t last_flags;
1575 
1576 	idx = tx->req & tx->mask;
1577 
1578 	last_flags = src->flags;
1579 	src->flags = 0;
1580 	wmb();
1581 	dst = dstp = &tx->lanai[idx];
1582 	srcp = src;
1583 
1584 	if ((idx + cnt) < tx->mask) {
1585 		for (i = 0; i < cnt - 1; i += 2) {
1586 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1587 			wmb(); /* force write every 32 bytes */
1588 			srcp += 2;
1589 			dstp += 2;
1590 		}
1591 	} else {
1592 		/*
1593 		 * Submit all but the first request, and ensure
1594 		 * that it is submitted below
1595 		 */
1596 		mxge_submit_req_backwards(tx, src, cnt);
1597 		i = 0;
1598 	}
1599 	if (i < cnt) {
1600 		/* Submit the first request */
1601 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1602 		wmb(); /* barrier before setting valid flag */
1603 	}
1604 
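	/*
	 * Each send descriptor is 16 bytes and the flags byte lives in
	 * the last 32-bit word, so rewriting word 3 of the first
	 * descriptor publishes the valid flags in one aligned store.
	 */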
1605 	/* Re-write the last 32-bits with the valid flags */
1606 	src->flags = last_flags;
1607 	src_ints = (uint32_t *)src;
1608 	src_ints += 3;
1609 	dst_ints = (volatile uint32_t *)dst;
1610 	dst_ints += 3;
1611 	*dst_ints = *src_ints;
1612 	tx->req += cnt;
1613 	wmb();
1614 }
1615 
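/*
 * For TSO the Ethernet, IP and TCP headers must be contiguous in the
 * first mbuf: mxge_encap_tso() computes the header length from the
 * csum_* fields and the firmware parses the headers as a unit.  Pull
 * the headers up if the first mbuf is too short.
 */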
1616 static int
1617 mxge_pullup_tso(struct mbuf **mp)
1618 {
1619 	int hoff, iphlen, thoff;
1620 	struct mbuf *m;
1621 
1622 	m = *mp;
1623 	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1624 
1625 	iphlen = m->m_pkthdr.csum_iphlen;
1626 	thoff = m->m_pkthdr.csum_thlen;
1627 	hoff = m->m_pkthdr.csum_lhlen;
1628 
1629 	KASSERT(iphlen > 0, ("invalid ip hlen"));
1630 	KASSERT(thoff > 0, ("invalid tcp hlen"));
1631 	KASSERT(hoff > 0, ("invalid ether hlen"));
1632 
1633 	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1634 		m = m_pullup(m, hoff + iphlen + thoff);
1635 		if (m == NULL) {
1636 			*mp = NULL;
1637 			return ENOBUFS;
1638 		}
1639 		*mp = m;
1640 	}
1641 	return 0;
1642 }
1643 
1644 static int
1645 mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
1646     struct mbuf *m, int busdma_seg_cnt)
1647 {
1648 	mcp_kreq_ether_send_t *req;
1649 	bus_dma_segment_t *seg;
1650 	uint32_t low, high_swapped;
1651 	int len, seglen, cum_len, cum_len_next;
1652 	int next_is_first, chop, cnt, rdma_count, small;
1653 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1654 	uint8_t flags, flags_next;
1655 	struct mxge_buffer_state *info_last;
1656 	bus_dmamap_t map = info_map->map;
1657 
1658 	mss = m->m_pkthdr.tso_segsz;
1659 
1660 	/*
1661 	 * Negative cum_len signifies to the send loop that we are
1662 	 * still in the header portion of the TSO packet.
1663 	 */
1664 	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1665 	    m->m_pkthdr.csum_thlen);
1666 
1667 	/*
1668 	 * TSO implies checksum offload on this hardware
1669 	 */
1670 	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1671 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1672 
1673 	/*
1674 	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
1675 	 * out where to put the checksum by parsing the header.
1676 	 */
1677 	pseudo_hdr_offset = htobe16(mss);
1678 
1679 	req = tx->req_list;
1680 	seg = tx->seg_list;
1681 	cnt = 0;
1682 	rdma_count = 0;
1683 
1684 	/*
1685 	 * "rdma_count" is the number of RDMAs belonging to the current
1686 	 * packet BEFORE the current send request.  For non-TSO packets,
1687 	 * this is equal to "count".
1688 	 *
1689 	 * For TSO packets, rdma_count needs to be reset to 0 after a
1690 	 * segment cut.
1691 	 *
1692 	 * The rdma_count field of the send request is the number of
1693 	 * RDMAs of the packet starting at that request.  For TSO send
1694 	 * requests with one or more cuts in the middle, this is the
1695 	 * number of RDMAs starting after the last cut in the request.
1696 	 * All previous segments before the last cut implicitly have 1
1697 	 * RDMA.
1698 	 *
1699 	 * Since the number of RDMAs is not known beforehand, it must be
1700 	 * filled-in retroactively - after each segmentation cut or at
1701 	 * the end of the entire packet.
1702 	 */
1703 
1704 	while (busdma_seg_cnt) {
1705 		/*
1706 		 * Break the busdma segment up into pieces
1707 		 */
1708 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1709 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1710 		len = seg->ds_len;
1711 
1712 		while (len) {
1713 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1714 			seglen = len;
1715 			cum_len_next = cum_len + seglen;
1716 			(req - rdma_count)->rdma_count = rdma_count + 1;
1717 			if (__predict_true(cum_len >= 0)) {
1718 				/* Payload */
1719 				chop = (cum_len_next > mss);
1720 				cum_len_next = cum_len_next % mss;
1721 				next_is_first = (cum_len_next == 0);
1722 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1723 				flags_next |=
1724 				    next_is_first * MXGEFW_FLAGS_FIRST;
1725 				rdma_count |= -(chop | next_is_first);
1726 				rdma_count += chop & !next_is_first;
1727 			} else if (cum_len_next >= 0) {
1728 				/* Header ends */
1729 				rdma_count = -1;
1730 				cum_len_next = 0;
1731 				seglen = -cum_len;
1732 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1733 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1734 				    MXGEFW_FLAGS_FIRST |
1735 				    (small * MXGEFW_FLAGS_SMALL);
1736 			}
1737 
1738 			req->addr_high = high_swapped;
1739 			req->addr_low = htobe32(low);
1740 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1741 			req->pad = 0;
1742 			req->rdma_count = 1;
1743 			req->length = htobe16(seglen);
1744 			req->cksum_offset = cksum_offset;
1745 			req->flags =
1746 			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1747 			low += seglen;
1748 			len -= seglen;
1749 			cum_len = cum_len_next;
1750 			flags = flags_next;
1751 			req++;
1752 			cnt++;
1753 			rdma_count++;
1754 			if (__predict_false(cksum_offset > seglen))
1755 				cksum_offset -= seglen;
1756 			else
1757 				cksum_offset = 0;
1758 			if (__predict_false(cnt > tx->max_desc))
1759 				goto drop;
1760 		}
1761 		busdma_seg_cnt--;
1762 		seg++;
1763 	}
1764 	(req - rdma_count)->rdma_count = rdma_count;
1765 
1766 	do {
1767 		req--;
1768 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1769 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1770 
1771 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1772 
1773 	info_map->map = info_last->map;
1774 	info_last->map = map;
1775 	info_last->m = m;
1776 
1777 	mxge_submit_req(tx, tx->req_list, cnt);
1778 #ifdef IFNET_BUF_RING
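	/* XXX 'ss' is not in scope in this function */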
1779 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1780 		/* tell the NIC to start polling this slice */
1781 		*tx->send_go = 1;
1782 		tx->queue_active = 1;
1783 		tx->activate++;
1784 		wmb();
1785 	}
1786 #endif
1787 	return 0;
1788 
1789 drop:
1790 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1791 	m_freem(m);
1792 	return ENOBUFS;
1793 }
1794 
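/*
 * Transmit one frame: DMA-map it (defragmenting if it needs more
 * than tx->max_desc - 2 segments), then either branch to the TSO
 * path or build a plain send-request list, padding runts to the
 * 60-byte minimum with the shared zero-filled pad buffer.
 */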
1795 static int
1796 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1797 {
1798 	mcp_kreq_ether_send_t *req;
1799 	bus_dma_segment_t *seg;
1800 	bus_dmamap_t map;
1801 	int cnt, cum_len, err, i, idx, odd_flag;
1802 	uint16_t pseudo_hdr_offset;
1803 	uint8_t flags, cksum_offset;
1804 	struct mxge_buffer_state *info_map, *info_last;
1805 
1806 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1807 		err = mxge_pullup_tso(&m);
1808 		if (__predict_false(err))
1809 			return err;
1810 	}
1811 
1812 	/*
1813 	 * Map the frame for DMA
1814 	 */
1815 	idx = tx->req & tx->mask;
1816 	info_map = &tx->info[idx];
1817 	map = info_map->map;
1818 
1819 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1820 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1821 	if (__predict_false(err != 0))
1822 		goto drop;
1823 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1824 
1825 	/*
1826 	 * TSO is different enough, we handle it in another routine
1827 	 */
1828 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1829 		return mxge_encap_tso(tx, info_map, m, cnt);
1830 
1831 	req = tx->req_list;
1832 	cksum_offset = 0;
1833 	pseudo_hdr_offset = 0;
1834 	flags = MXGEFW_FLAGS_NO_TSO;
1835 
1836 	/*
1837 	 * Checksum offloading
1838 	 */
1839 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1840 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1841 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1842 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1843 		req->cksum_offset = cksum_offset;
1844 		flags |= MXGEFW_FLAGS_CKSUM;
1845 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1846 	} else {
1847 		odd_flag = 0;
1848 	}
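	/*
	 * Editorial note (worked example, not from the original source):
	 * for a plain TCP/IPv4 frame, csum_lhlen is the 14-byte Ethernet
	 * header and csum_iphlen a 20-byte IP header, so cksum_offset is
	 * 34 (the start of the TCP header).  csum_data is the offset of
	 * the checksum field within the TCP header (16), which puts
	 * pseudo_hdr_offset at byte 50.
	 */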
1849 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1850 		flags |= MXGEFW_FLAGS_SMALL;
1851 
1852 	/*
1853 	 * Convert segments into a request list
1854 	 */
1855 	cum_len = 0;
1856 	seg = tx->seg_list;
1857 	req->flags = MXGEFW_FLAGS_FIRST;
1858 	for (i = 0; i < cnt; i++) {
1859 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1860 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1861 		req->length = htobe16(seg->ds_len);
1862 		req->cksum_offset = cksum_offset;
1863 		if (cksum_offset > seg->ds_len)
1864 			cksum_offset -= seg->ds_len;
1865 		else
1866 			cksum_offset = 0;
1867 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1868 		req->pad = 0; /* complete solid 16-byte block */
1869 		req->rdma_count = 1;
1870 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1871 		cum_len += seg->ds_len;
1872 		seg++;
1873 		req++;
1874 		req->flags = 0;
1875 	}
1876 	req--;
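	/*
	 * Editorial note: the loop above zeroes the flags of one
	 * descriptor past the last one it filled, so the runt-pad
	 * descriptor below (which does req++ before req->flags |= ...)
	 * starts from a clean flags byte; req-- backs up onto the last
	 * real descriptor.
	 */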
1877 
1878 	/*
1879 	 * Pad runt to 60 bytes
1880 	 */
1881 	if (cum_len < 60) {
1882 		req++;
1883 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1884 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1885 		req->length = htobe16(60 - cum_len);
1886 		req->cksum_offset = 0;
1887 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1888 		req->pad = 0; /* complete solid 16-byte block */
1889 		req->rdma_count = 1;
1890 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1891 		cnt++;
1892 	}
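	/*
	 * Editorial note: 60 bytes is the minimum Ethernet frame length
	 * excluding the 4-byte FCS the NIC appends.  The extra
	 * descriptor points at the shared zeropad DMA buffer, so the
	 * firmware transmits zero padding without the host touching the
	 * mbuf.
	 */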
1893 
1894 	tx->req_list[0].rdma_count = cnt;
1895 #if 0
1896 	/* print what the firmware will see */
1897 	for (i = 0; i < cnt; i++) {
1898 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1899 		    "cso:%d, flags:0x%x, rdma:%d\n",
1900 		    i, (int)ntohl(tx->req_list[i].addr_high),
1901 		    (int)ntohl(tx->req_list[i].addr_low),
1902 		    (int)ntohs(tx->req_list[i].length),
1903 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1904 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1905 		    tx->req_list[i].rdma_count);
1906 	}
1907 	kprintf("--------------\n");
1908 #endif
1909 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1910 
1911 	info_map->map = info_last->map;
1912 	info_last->map = map;
1913 	info_last->m = m;
1914 
1915 	mxge_submit_req(tx, tx->req_list, cnt);
1916 #ifdef IFNET_BUF_RING
1917 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1918 		/* tell the NIC to start polling this slice */
1919 		*tx->send_go = 1;
1920 		tx->queue_active = 1;
1921 		tx->activate++;
1922 		wmb();
1923 	}
1924 #endif
1925 	return 0;
1926 
1927 drop:
1928 	m_freem(m);
1929 	return err;
1930 }
1931 
1932 static void
1933 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1934 {
1935 	mxge_softc_t *sc = ifp->if_softc;
1936 	mxge_tx_ring_t *tx;
1937 	bus_addr_t zeropad;
1938 	int encap = 0;
1939 
1940 	/* XXX Only use the first slice for now */
1941 	tx = &sc->ss[0].tx;
1942 
1943 	ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1944 	ASSERT_SERIALIZED(&tx->tx_serialize);
1945 
1946 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1947 		return;
1948 
1949 	zeropad = sc->zeropad_dma.dmem_busaddr;
1950 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1951 		struct mbuf *m;
1952 		int error;
1953 
1954 		m = ifsq_dequeue(ifsq);
1955 		if (m == NULL)
1956 			goto done;
1957 
1958 		BPF_MTAP(ifp, m);
1959 		error = mxge_encap(tx, m, zeropad);
1960 		if (!error)
1961 			encap = 1;
1962 		else
1963 			IFNET_STAT_INC(ifp, oerrors, 1);
1964 	}
1965 
1966 	/* Ran out of transmit slots */
1967 	ifsq_set_oactive(ifsq);
1968 done:
1969 	if (encap)
1970 		ifp->if_timer = 5;
1971 }
1972 
1973 static void
1974 mxge_watchdog(struct ifnet *ifp)
1975 {
1976 	struct mxge_softc *sc = ifp->if_softc;
1977 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1978 	mxge_tx_ring_t *tx = &sc->ss[0].tx;
1979 
1980 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
1981 
1982 	/* Check for pause blocking before resetting */
1983 	if (tx->watchdog_rx_pause == rx_pause) {
1984 		mxge_warn_stuck(sc, tx, 0);
1985 		mxge_watchdog_reset(sc);
1986 		return;
1987 	} else {
1988 		if_printf(ifp, "Flow control blocking xmits, "
1989 		    "check link partner\n");
1990 	}
1991 	tx->watchdog_rx_pause = rx_pause;
1992 }
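/*
 * Editorial note: dropped_pause appears to count pause frames seen by
 * the firmware.  If the counter did not move since the last watchdog
 * tick, flow control cannot explain the stalled ring, so the NIC is
 * reset; otherwise the link partner's pause frames are blamed.
 */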
1993 
1994 /*
1995  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1996  * at most 32 bytes at a time, so as to avoid involving the software
1997  * PIO handler in the NIC.  We re-write the first segment's low
1998  * DMA address to mark it valid only after we write the entire chunk
1999  * in a burst.
2000  */
2001 static __inline void
2002 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2003     mcp_kreq_ether_recv_t *src)
2004 {
2005 	uint32_t low;
2006 
2007 	low = src->addr_low;
2008 	src->addr_low = 0xffffffff;
2009 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2010 	wmb();
2011 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2012 	wmb();
2013 	src->addr_low = low;
2014 	dst->addr_low = low;
2015 	wmb();
2016 }
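/*
 * Editorial note: each mcp_kreq_ether_recv_t is an 8-byte DMA address
 * (high and low 32-bit words), so the 8-entry batch is 64 bytes,
 * written as two 32-byte PIO bursts.  Temporarily clobbering
 * src->addr_low with all-ones keeps the firmware from acting on the
 * first entry until the final store publishes the whole batch.
 */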
2017 
2018 static int
2019 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2020     boolean_t init)
2021 {
2022 	bus_dma_segment_t seg;
2023 	struct mbuf *m;
2024 	int cnt, err, mflag;
2025 
2026 	mflag = MB_DONTWAIT;
2027 	if (__predict_false(init))
2028 		mflag = MB_WAIT;
2029 
2030 	m = m_gethdr(mflag, MT_DATA);
2031 	if (m == NULL) {
2032 		err = ENOBUFS;
2033 		if (__predict_false(init)) {
2034 			/*
2035 			 * During initialization, there
2036 			 * is nothing to set up; bail out
2037 			 */
2038 			return err;
2039 		}
2040 		goto done;
2041 	}
2042 	m->m_len = m->m_pkthdr.len = MHLEN;
2043 
2044 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2045 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2046 	if (err != 0) {
2047 		m_freem(m);
2048 		if (__predict_false(init)) {
2049 			/*
2050 			 * During initialization, there
2051 			 * is nothing to set up; bail out
2052 			 */
2053 			return err;
2054 		}
2055 		goto done;
2056 	}
2057 
2058 	rx->info[idx].m = m;
2059 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2060 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2061 
2062 done:
2063 	if ((idx & 7) == 7)
2064 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2065 	return err;
2066 }
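/*
 * Editorial note: receive buffers are handed to the NIC only in groups
 * of eight; the (idx & 7) == 7 test fires on every eighth ring slot so
 * mxge_submit_8rx() can post the group in one burst.  On a failed
 * refill the shadow slot keeps its previous buffer address, which is
 * safe because the caller then drops the frame and recycles that mbuf.
 */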
2067 
2068 static int
2069 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2070     boolean_t init)
2071 {
2072 	bus_dma_segment_t seg;
2073 	struct mbuf *m;
2074 	int cnt, err, mflag;
2075 
2076 	mflag = MB_DONTWAIT;
2077 	if (__predict_false(init))
2078 		mflag = MB_WAIT;
2079 
2080 	if (rx->cl_size == MCLBYTES)
2081 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2082 	else
2083 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2084 	if (m == NULL) {
2085 		err = ENOBUFS;
2086 		if (__predict_false(init)) {
2087 			/*
2088 			 * During initialization, there
2089 			 * is nothing to set up; bail out
2090 			 */
2091 			return err;
2092 		}
2093 		goto done;
2094 	}
2095 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2096 
2097 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2098 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2099 	if (err != 0) {
2100 		m_freem(m);
2101 		if (__predict_false(init)) {
2102 			/*
2103 			 * During initialization, there
2104 			 * is nothing to set up; bail out
2105 			 */
2106 			return err;
2107 		}
2108 		goto done;
2109 	}
2110 
2111 	rx->info[idx].m = m;
2112 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2113 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2114 
2115 done:
2116 	if ((idx & 7) == 7)
2117 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2118 	return err;
2119 }
2120 
2121 /*
2122  * Myri10GE hardware checksums are not valid if the sender
2123  * padded the frame with non-zero padding.  This is because
2124  * the firmware just does a simple 16-bit 1s complement
2125  * checksum across the entire frame, excluding the first 14
2126  * bytes.  It is best to simply check the checksum and
2127  * tell the stack about it only if the checksum is good.
2128  */
2129 static __inline uint16_t
2130 mxge_rx_csum(struct mbuf *m, int csum)
2131 {
2132 	const struct ether_header *eh;
2133 	const struct ip *ip;
2134 	uint16_t c;
2135 
2136 	eh = mtod(m, const struct ether_header *);
2137 
2138 	/* Only deal with IPv4 TCP & UDP for now */
2139 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2140 		return 1;
2141 
2142 	ip = (const struct ip *)(eh + 1);
2143 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2144 		return 1;
2145 
2146 #ifdef INET
2147 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2148 	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
2149 	          (ip->ip_hl << 2) + ip->ip_p));
2150 #else
2151 	c = 1;
2152 #endif
2153 	c ^= 0xffff;
2154 	return c;
2155 }
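/*
 * Editorial note on the arithmetic above: a valid IPv4 header sums to
 * 0xffff, which is the identity in ones-complement addition, and a
 * valid TCP/UDP checksum makes pseudo-header plus L4 segment sum to
 * 0xffff as well.  Folding the hardware sum (IP header plus payload)
 * together with the pseudo-header therefore yields 0xffff exactly when
 * the checksum is good; the final XOR maps that to the 0 the receive
 * path tests for.
 */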
2156 
2157 static void
2158 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2159 {
2160 	struct ether_vlan_header *evl;
2161 	uint32_t partial;
2162 
2163 	evl = mtod(m, struct ether_vlan_header *);
2164 
2165 	/*
2166 	 * Fix the checksum by subtracting the contribution of the
2167 	 * EVL_ENCAPLEN bytes that follow what the firmware thought
2168 	 * was the end of the Ethernet header.
2169 	 */
2170 
2171 	/* Put checksum into host byte order */
2172 	*csum = ntohs(*csum);
2173 
2174 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2175 	*csum += ~partial;
2176 	*csum += ((*csum) < ~partial);
2177 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2178 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
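	/*
	 * Editorial note: adding ~partial is a ones-complement
	 * subtraction of the 4 bytes being removed from the checksummed
	 * region; the carry add and the two folds then reduce the
	 * result back to 16 bits.
	 */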
2179 
2180 	/*
2181 	 * Restore checksum to network byte order;
2182 	 * later consumers expect this
2183 	 */
2184 	*csum = htons(*csum);
2185 
2186 	/* save the tag */
2187 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2188 	m->m_flags |= M_VLANTAG;
2189 
2190 	/*
2191 	 * Remove the 802.1q header by copying the Ethernet
2192 	 * addresses over it and adjusting the beginning of
2193 	 * the data in the mbuf.  The encapsulated Ethernet
2194 	 * type field is already in place.
2195 	 */
2196 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2197 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2198 	m_adj(m, EVL_ENCAPLEN);
2199 }
2200 
2201 
2202 static __inline void
2203 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2204     uint32_t len, uint32_t csum)
2205 {
2206 	struct mbuf *m;
2207 	const struct ether_header *eh;
2208 	bus_dmamap_t old_map;
2209 	int idx;
2210 
2211 	idx = rx->cnt & rx->mask;
2212 	rx->cnt++;
2213 
2214 	/* Save a pointer to the received mbuf */
2215 	m = rx->info[idx].m;
2216 
2217 	/* Try to replace the received mbuf */
2218 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2219 		/* Drop the frame -- the old mbuf is re-cycled */
2220 		IFNET_STAT_INC(ifp, ierrors, 1);
2221 		return;
2222 	}
2223 
2224 	/* Unmap the received buffer */
2225 	old_map = rx->info[idx].map;
2226 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2227 	bus_dmamap_unload(rx->dmat, old_map);
2228 
2229 	/* Swap the bus_dmamap_t's */
2230 	rx->info[idx].map = rx->extra_map;
2231 	rx->extra_map = old_map;
2232 
2233 	/*
2234 	 * The MCP implicitly skips the first 2 bytes (MXGEFW_PAD) so
2235 	 * that the packet's IP header ends up 4-byte aligned.
2236 	 */
2237 	m->m_data += MXGEFW_PAD;
2238 
2239 	m->m_pkthdr.rcvif = ifp;
2240 	m->m_len = m->m_pkthdr.len = len;
2241 
2242 	IFNET_STAT_INC(ifp, ipackets, 1);
2243 
2244 	eh = mtod(m, const struct ether_header *);
2245 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2246 		mxge_vlan_tag_remove(m, &csum);
2247 
2248 	/* If the checksum is valid, mark it in the mbuf header */
2249 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2250 	    mxge_rx_csum(m, csum) == 0) {
2251 		/* Tell the stack that the checksum is good */
2252 		m->m_pkthdr.csum_data = 0xffff;
2253 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2254 		    CSUM_DATA_VALID;
2255 	}
2256 	ifp->if_input(ifp, m);
2257 }
2258 
2259 static __inline void
2260 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2261     uint32_t len, uint32_t csum)
2262 {
2263 	const struct ether_header *eh;
2264 	struct mbuf *m;
2265 	bus_dmamap_t old_map;
2266 	int idx;
2267 
2268 	idx = rx->cnt & rx->mask;
2269 	rx->cnt++;
2270 
2271 	/* Save a pointer to the received mbuf */
2272 	m = rx->info[idx].m;
2273 
2274 	/* Try to replace the received mbuf */
2275 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2276 		/* Drop the frame -- the old mbuf is re-cycled */
2277 		IFNET_STAT_INC(ifp, ierrors, 1);
2278 		return;
2279 	}
2280 
2281 	/* Unmap the received buffer */
2282 	old_map = rx->info[idx].map;
2283 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2284 	bus_dmamap_unload(rx->dmat, old_map);
2285 
2286 	/* Swap the bus_dmamap_t's */
2287 	rx->info[idx].map = rx->extra_map;
2288 	rx->extra_map = old_map;
2289 
2290 	/*
2291 	 * The MCP implicitly skips the first 2 bytes (MXGEFW_PAD) so
2292 	 * that the packet's IP header ends up 4-byte aligned.
2293 	 */
2294 	m->m_data += MXGEFW_PAD;
2295 
2296 	m->m_pkthdr.rcvif = ifp;
2297 	m->m_len = m->m_pkthdr.len = len;
2298 
2299 	IFNET_STAT_INC(ifp, ipackets, 1);
2300 
2301 	eh = mtod(m, const struct ether_header *);
2302 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2303 		mxge_vlan_tag_remove(m, &csum);
2304 
2305 	/* If the checksum is valid, mark it in the mbuf header */
2306 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2307 	    mxge_rx_csum(m, csum) == 0) {
2308 		/* Tell the stack that the checksum is good */
2309 		m->m_pkthdr.csum_data = 0xffff;
2310 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2311 		    CSUM_DATA_VALID;
2312 	}
2313 	ifp->if_input(ifp, m);
2314 }
2315 
2316 static __inline void
2317 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data)
2318 {
2319 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2320 
2321 	while (rx_done->entry[rx_done->idx].length != 0) {
2322 		uint16_t length, checksum;
2323 
2324 		length = ntohs(rx_done->entry[rx_done->idx].length);
2325 		rx_done->entry[rx_done->idx].length = 0;
2326 
2327 		checksum = rx_done->entry[rx_done->idx].checksum;
2328 
2329 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2330 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2331 			    length, checksum);
2332 		} else {
2333 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2334 			    length, checksum);
2335 		}
2336 
2337 		rx_done->idx++;
2338 		rx_done->idx &= rx_done->mask;
2339 	}
2340 }
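/*
 * Editorial note: the firmware marks a receive-done slot valid by
 * storing a non-zero length; zeroing the length above returns the
 * slot, so no separate producer/consumer index is exchanged with the
 * NIC.
 */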
2341 
2342 static __inline void
2343 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2344 {
2345 	ASSERT_SERIALIZED(&tx->tx_serialize);
2346 
2347 	while (tx->pkt_done != mcp_idx) {
2348 		struct mbuf *m;
2349 		int idx;
2350 
2351 		idx = tx->done & tx->mask;
2352 		tx->done++;
2353 
2354 		m = tx->info[idx].m;
2355 		/*
2356 		 * mbuf and DMA map only attached to the first
2357 		 * segment per-mbuf.
2358 		 */
2359 		if (m != NULL) {
2360 			tx->pkt_done++;
2361 			IFNET_STAT_INC(ifp, opackets, 1);
2362 			tx->info[idx].m = NULL;
2363 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2364 			m_freem(m);
2365 		}
2366 	}
2367 
2368 	/*
2369 	 * If we have space, clear OACTIVE to tell the stack that
2370 	 * it's OK to send packets.
2371 	 */
2372 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2373 		ifq_clr_oactive(&ifp->if_snd);
2374 		if (tx->req == tx->done)
2375 			ifp->if_timer = 0;
2376 	}
2377 
2378 	if (!ifq_is_empty(&ifp->if_snd))
2379 		if_devstart(ifp);
2380 
2381 #ifdef IFNET_BUF_RING
2382 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2383 		/*
2384 		 * Let the NIC stop polling this queue, since there
2385 		 * are no more transmits pending.
2386 		 */
2387 		*tx->send_stop = 1;
2388 		tx->queue_active = 0;
2389 		tx->deactivate++;
2390 		wmb();
2391 	}
2392 #endif
2393 }
2394 
2395 static struct mxge_media_type mxge_xfp_media_types[] = {
2396 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2397 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2398 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2399 	{0,		(1 << 5),	"10GBASE-ER"},
2400 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2401 	{0,		(1 << 3),	"10GBASE-SW"},
2402 	{0,		(1 << 2),	"10GBASE-LW"},
2403 	{0,		(1 << 1),	"10GBASE-EW"},
2404 	{0,		(1 << 0),	"Reserved"}
2405 };
2406 
2407 static struct mxge_media_type mxge_sfp_media_types[] = {
2408 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2409 	{0,		(1 << 7),	"Reserved"},
2410 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2411 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2412 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2413 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2414 };
2415 
2416 static void
2417 mxge_media_set(mxge_softc_t *sc, int media_type)
2418 {
2419 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2420 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2421 	sc->current_media = media_type;
2422 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2423 }
2424 
2425 static void
2426 mxge_media_init(mxge_softc_t *sc)
2427 {
2428 	const char *ptr;
2429 	int i;
2430 
2431 	ifmedia_removeall(&sc->media);
2432 	mxge_media_set(sc, IFM_AUTO);
2433 
2434 	/*
2435 	 * Parse the product code to determine the interface type
2436 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2437 	 * after the 3rd dash in the driver's cached copy of the
2438 	 * EEPROM's product code string.
2439 	 */
2440 	ptr = sc->product_code_string;
2441 	if (ptr == NULL) {
2442 		if_printf(sc->ifp, "Missing product code\n");
2443 		return;
2444 	}
2445 
2446 	for (i = 0; i < 3; i++, ptr++) {
2447 		ptr = strchr(ptr, '-');
2448 		if (ptr == NULL) {
2449 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2450 			return;
2451 		}
2452 	}
2453 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2454 		/* -C is CX4 */
2455 		sc->connector = MXGE_CX4;
2456 		mxge_media_set(sc, IFM_10G_CX4);
2457 	} else if (*ptr == 'Q') {
2458 		/* -Q is Quad Ribbon Fiber */
2459 		sc->connector = MXGE_QRF;
2460 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2461 		/* DragonFly has no media type for Quad ribbon fiber */
2462 	} else if (*ptr == 'R') {
2463 		/* -R is XFP */
2464 		sc->connector = MXGE_XFP;
2465 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2466 		/* -S or -2S is SFP+ */
2467 		sc->connector = MXGE_SFP;
2468 	} else {
2469 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2470 	}
2471 }
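/*
 * Editorial note (hypothetical example): for a product code shaped
 * like "10G-PCIE-8B-S", ptr lands on the character after the third
 * dash ('S', selecting SFP+).  Codes ending in "-2S" are why the
 * second character, *(ptr + 1), is checked as well.
 */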
2472 
2473 /*
2474  * Determine the media type for a NIC.  Some XFPs will identify
2475  * themselves only when their link is up, so this is initiated via a
2476  * link up interrupt.  However, this can potentially take up to
2477  * several milliseconds, so it is run via the watchdog routine, rather
2478  * than in the interrupt handler itself.
2479  */
2480 static void
2481 mxge_media_probe(mxge_softc_t *sc)
2482 {
2483 	mxge_cmd_t cmd;
2484 	const char *cage_type;
2485 	struct mxge_media_type *mxge_media_types = NULL;
2486 	int i, err, ms, mxge_media_type_entries;
2487 	uint32_t byte;
2488 
2489 	sc->need_media_probe = 0;
2490 
2491 	if (sc->connector == MXGE_XFP) {
2492 		/* -R is XFP */
2493 		mxge_media_types = mxge_xfp_media_types;
2494 		mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2495 		    sizeof(mxge_xfp_media_types[0]);
2496 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2497 		cage_type = "XFP";
2498 	} else 	if (sc->connector == MXGE_SFP) {
2499 		/* -S or -2S is SFP+ */
2500 		mxge_media_types = mxge_sfp_media_types;
2501 		mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2502 		    sizeof(mxge_sfp_media_types[0]);
2503 		cage_type = "SFP+";
2504 		byte = 3;
2505 	} else {
2506 		/* nothing to do; media type cannot change */
2507 		return;
2508 	}
2509 
2510 	/*
2511 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2512 	 * now we try to determine what is in the cage by using the
2513 	 * firmware's I2C commands to read the 10GbE compliance
2514 	 * register.  We read just one byte, which may take over
2515 	 * a millisecond.
2516 	 */
2517 
2518 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2519 	cmd.data1 = byte;
2520 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2521 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2522 		if_printf(sc->ifp, "failed to read XFP\n");
2523 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2524 		if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2525 	if (err != MXGEFW_CMD_OK)
2526 		return;
2527 
2528 	/* Now we wait for the data to be cached */
2529 	cmd.data0 = byte;
2530 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2531 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2532 		DELAY(1000);
2533 		cmd.data0 = byte;
2534 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2535 	}
2536 	if (err != MXGEFW_CMD_OK) {
2537 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2538 		    cage_type, err, ms);
2539 		return;
2540 	}
2541 
2542 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2543 		if (bootverbose) {
2544 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2545 			    mxge_media_types[0].name);
2546 		}
2547 		if (sc->current_media != mxge_media_types[0].flag) {
2548 			mxge_media_init(sc);
2549 			mxge_media_set(sc, mxge_media_types[0].flag);
2550 		}
2551 		return;
2552 	}
2553 	for (i = 1; i < mxge_media_type_entries; i++) {
2554 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2555 			if (bootverbose) {
2556 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2557 				    mxge_media_types[i].name);
2558 			}
2559 
2560 			if (sc->current_media != mxge_media_types[i].flag) {
2561 				mxge_media_init(sc);
2562 				mxge_media_set(sc, mxge_media_types[i].flag);
2563 			}
2564 			return;
2565 		}
2566 	}
2567 	if (bootverbose) {
2568 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2569 		    cmd.data0);
2570 	}
2571 }
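/*
 * Editorial note: entry 0 of each media table is special-cased with a
 * full-byte equality test in mxge_media_probe() above, while the
 * remaining entries are matched bit by bit against the compliance
 * byte read over I2C.
 */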
2572 
2573 static void
2574 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2575 {
2576 	if (sc->link_state != stats->link_up) {
2577 		sc->link_state = stats->link_up;
2578 		if (sc->link_state) {
2579 			sc->ifp->if_link_state = LINK_STATE_UP;
2580 			if_link_state_change(sc->ifp);
2581 			if (bootverbose)
2582 				if_printf(sc->ifp, "link up\n");
2583 		} else {
2584 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2585 			if_link_state_change(sc->ifp);
2586 			if (bootverbose)
2587 				if_printf(sc->ifp, "link down\n");
2588 		}
2589 		sc->need_media_probe = 1;
2590 	}
2591 
2592 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2593 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2594 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2595 		    sc->rdma_tags_available);
2596 	}
2597 
2598 	if (stats->link_down) {
2599 		sc->down_cnt += stats->link_down;
2600 		sc->link_state = 0;
2601 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2602 		if_link_state_change(sc->ifp);
2603 	}
2604 }
2605 
2606 static void
2607 mxge_serialize_skipmain(struct mxge_softc *sc)
2608 {
2609 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2610 }
2611 
2612 static void
2613 mxge_deserialize_skipmain(struct mxge_softc *sc)
2614 {
2615 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2616 }
2617 
2618 static void
2619 mxge_legacy(void *arg)
2620 {
2621 	struct mxge_slice_state *ss = arg;
2622 	mxge_softc_t *sc = ss->sc;
2623 	mcp_irq_data_t *stats = ss->fw_stats;
2624 	mxge_tx_ring_t *tx = &ss->tx;
2625 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2626 	uint32_t send_done_count;
2627 	uint8_t valid;
2628 
2629 	ASSERT_SERIALIZED(&sc->main_serialize);
2630 
2631 #if 0
2632 	/* an interrupt on a non-zero slice is implicitly valid
2633 	   since MSI-X irqs are not shared */
2634 	if (ss != sc->ss) {
2635 		mxge_clean_rx_done(rx_done);
2636 		*ss->irq_claim = be32toh(3);
2637 		return;
2638 	}
2639 #endif
2640 
2641 	/* Make sure the DMA has finished */
2642 	if (!stats->valid)
2643 		return;
2644 	valid = stats->valid;
2645 
2646 	/* Lower legacy IRQ */
2647 	*sc->irq_deassert = 0;
2648 	if (!mxge_deassert_wait) {
2649 		/* Don't wait for confirmation that the irq is low */
2650 		stats->valid = 0;
2651 	}
2652 
2653 	mxge_serialize_skipmain(sc);
2654 
2655 	/*
2656 	 * Loop while waiting for legacy irq deassertion
2657 	 * XXX do we really want to loop?
2658 	 */
2659 	do {
2660 		/* Check for transmit completes and receives */
2661 		send_done_count = be32toh(stats->send_done_count);
2662 		while ((send_done_count != tx->pkt_done) ||
2663 		       (rx_done->entry[rx_done->idx].length != 0)) {
2664 			if (send_done_count != tx->pkt_done) {
2665 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2666 				    (int)send_done_count);
2667 			}
2668 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data);
2669 			send_done_count = be32toh(stats->send_done_count);
2670 		}
2671 		if (mxge_deassert_wait)
2672 			wmb();
2673 	} while (*((volatile uint8_t *)&stats->valid));
2674 
2675 	mxge_deserialize_skipmain(sc);
2676 
2677 	/* Fw link & error stats meaningful only on the first slice */
2678 	if (__predict_false(stats->stats_updated))
2679 		mxge_intr_status(sc, stats);
2680 
2681 	/* Check to see if we have rx token to pass back */
2682 	if (valid & 0x1)
2683 	    *ss->irq_claim = be32toh(3);
2684 	*(ss->irq_claim + 1) = be32toh(3);
2685 }
2686 
2687 static void
2688 mxge_msi(void *arg)
2689 {
2690 	struct mxge_slice_state *ss = arg;
2691 	mxge_softc_t *sc = ss->sc;
2692 	mcp_irq_data_t *stats = ss->fw_stats;
2693 	mxge_tx_ring_t *tx = &ss->tx;
2694 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2695 	uint32_t send_done_count;
2696 	uint8_t valid;
2697 
2698 	ASSERT_SERIALIZED(&sc->main_serialize);
2699 
2700 	/* Make sure the DMA has finished */
2701 	if (__predict_false(!stats->valid))
2702 		return;
2703 
2704 	valid = stats->valid;
2705 	stats->valid = 0;
2706 
2707 	/* Check for receives */
2708 	lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2709 	if (rx_done->entry[rx_done->idx].length != 0)
2710 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data);
2711 	lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2712 
2713 	/*
2714 	 * Check for transmit completes
2715 	 *
2716 	 * NOTE:
2717 	 * Since pkt_done is only changed by mxge_tx_done(),
2718 	 * which is called only in interrupt handler, the
2719 	 * check w/o holding tx serializer is MPSAFE.
2720 	 */
2721 	send_done_count = be32toh(stats->send_done_count);
2722 	if (send_done_count != tx->pkt_done) {
2723 		lwkt_serialize_enter(&tx->tx_serialize);
2724 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2725 		lwkt_serialize_exit(&tx->tx_serialize);
2726 	}
2727 
2728 	if (__predict_false(stats->stats_updated))
2729 		mxge_intr_status(sc, stats);
2730 
2731 	/* Check to see if we have rx token to pass back */
2732 	if (valid & 0x1)
2733 	    *ss->irq_claim = be32toh(3);
2734 	*(ss->irq_claim + 1) = be32toh(3);
2735 }
2736 
2737 static void
2738 mxge_init(void *arg)
2739 {
2740 	struct mxge_softc *sc = arg;
2741 
2742 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2743 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2744 		mxge_open(sc);
2745 }
2746 
2747 static void
2748 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2749 {
2750 	int i;
2751 
2752 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2753 		if (ss->rx_data.rx_big.info[i].m == NULL)
2754 			continue;
2755 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2756 		    ss->rx_data.rx_big.info[i].map);
2757 		m_freem(ss->rx_data.rx_big.info[i].m);
2758 		ss->rx_data.rx_big.info[i].m = NULL;
2759 	}
2760 
2761 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2762 		if (ss->rx_data.rx_small.info[i].m == NULL)
2763 			continue;
2764 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2765 		    ss->rx_data.rx_small.info[i].map);
2766 		m_freem(ss->rx_data.rx_small.info[i].m);
2767 		ss->rx_data.rx_small.info[i].m = NULL;
2768 	}
2769 
2770 	/* Transmit ring used only on the first slice */
2771 	if (ss->tx.info == NULL)
2772 		return;
2773 
2774 	for (i = 0; i <= ss->tx.mask; i++) {
2775 		if (ss->tx.info[i].m == NULL)
2776 			continue;
2777 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2778 		m_freem(ss->tx.info[i].m);
2779 		ss->tx.info[i].m = NULL;
2780 	}
2781 }
2782 
2783 static void
2784 mxge_free_mbufs(mxge_softc_t *sc)
2785 {
2786 	int slice;
2787 
2788 	for (slice = 0; slice < sc->num_slices; slice++)
2789 		mxge_free_slice_mbufs(&sc->ss[slice]);
2790 }
2791 
2792 static void
2793 mxge_free_slice_rings(struct mxge_slice_state *ss)
2794 {
2795 	int i;
2796 
2797 	if (ss->rx_data.rx_done.entry != NULL) {
2798 		mxge_dma_free(&ss->rx_done_dma);
2799 		ss->rx_data.rx_done.entry = NULL;
2800 	}
2801 
2802 	if (ss->tx.req_list != NULL) {
2803 		kfree(ss->tx.req_list, M_DEVBUF);
2804 		ss->tx.req_list = NULL;
2805 	}
2806 
2807 	if (ss->tx.seg_list != NULL) {
2808 		kfree(ss->tx.seg_list, M_DEVBUF);
2809 		ss->tx.seg_list = NULL;
2810 	}
2811 
2812 	if (ss->rx_data.rx_small.shadow != NULL) {
2813 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2814 		ss->rx_data.rx_small.shadow = NULL;
2815 	}
2816 
2817 	if (ss->rx_data.rx_big.shadow != NULL) {
2818 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2819 		ss->rx_data.rx_big.shadow = NULL;
2820 	}
2821 
2822 	if (ss->tx.info != NULL) {
2823 		if (ss->tx.dmat != NULL) {
2824 			for (i = 0; i <= ss->tx.mask; i++) {
2825 				bus_dmamap_destroy(ss->tx.dmat,
2826 				    ss->tx.info[i].map);
2827 			}
2828 			bus_dma_tag_destroy(ss->tx.dmat);
2829 		}
2830 		kfree(ss->tx.info, M_DEVBUF);
2831 		ss->tx.info = NULL;
2832 	}
2833 
2834 	if (ss->rx_data.rx_small.info != NULL) {
2835 		if (ss->rx_data.rx_small.dmat != NULL) {
2836 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2837 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2838 				    ss->rx_data.rx_small.info[i].map);
2839 			}
2840 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2841 			    ss->rx_data.rx_small.extra_map);
2842 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2843 		}
2844 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2845 		ss->rx_data.rx_small.info = NULL;
2846 	}
2847 
2848 	if (ss->rx_data.rx_big.info != NULL) {
2849 		if (ss->rx_data.rx_big.dmat != NULL) {
2850 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2851 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2852 				    ss->rx_data.rx_big.info[i].map);
2853 			}
2854 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2855 			    ss->rx_data.rx_big.extra_map);
2856 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2857 		}
2858 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2859 		ss->rx_data.rx_big.info = NULL;
2860 	}
2861 }
2862 
2863 static void
2864 mxge_free_rings(mxge_softc_t *sc)
2865 {
2866 	int slice;
2867 
2868 	if (sc->ss == NULL)
2869 		return;
2870 
2871 	for (slice = 0; slice < sc->num_slices; slice++)
2872 		mxge_free_slice_rings(&sc->ss[slice]);
2873 }
2874 
2875 static int
2876 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2877     int tx_ring_entries)
2878 {
2879 	mxge_softc_t *sc = ss->sc;
2880 	size_t bytes;
2881 	int err, i;
2882 
2883 	/*
2884 	 * Allocate per-slice receive resources
2885 	 */
2886 
2887 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
2888 	    rx_ring_entries - 1;
2889 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
2890 
2891 	/* Allocate the rx shadow rings */
2892 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
2893 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2894 
2895 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
2896 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2897 
2898 	/* Allocate the rx host info rings */
2899 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
2900 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2901 
2902 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
2903 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2904 
2905 	/* Allocate the rx busdma resources */
2906 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2907 				 1,			/* alignment */
2908 				 4096,			/* boundary */
2909 				 BUS_SPACE_MAXADDR,	/* low */
2910 				 BUS_SPACE_MAXADDR,	/* high */
2911 				 NULL, NULL,		/* filter */
2912 				 MHLEN,			/* maxsize */
2913 				 1,			/* num segs */
2914 				 MHLEN,			/* maxsegsize */
2915 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2916 				 			/* flags */
2917 				 &ss->rx_data.rx_small.dmat); /* tag */
2918 	if (err != 0) {
2919 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2920 		    err);
2921 		return err;
2922 	}
2923 
2924 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
2925 	    &ss->rx_data.rx_small.extra_map);
2926 	if (err != 0) {
2927 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
2928 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2929 		ss->rx_data.rx_small.dmat = NULL;
2930 		return err;
2931 	}
2932 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2933 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
2934 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
2935 		if (err != 0) {
2936 			int j;
2937 
2938 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
2939 
2940 			for (j = 0; j < i; ++j) {
2941 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2942 				    ss->rx_data.rx_small.info[j].map);
2943 			}
2944 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2945 			    ss->rx_data.rx_small.extra_map);
2946 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2947 			ss->rx_data.rx_small.dmat = NULL;
2948 			return err;
2949 		}
2950 	}
2951 
2952 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2953 				 1,			/* alignment */
2954 				 4096,			/* boundary */
2955 				 BUS_SPACE_MAXADDR,	/* low */
2956 				 BUS_SPACE_MAXADDR,	/* high */
2957 				 NULL, NULL,		/* filter */
2958 				 4096,			/* maxsize */
2959 				 1,			/* num segs */
2960 				 4096,			/* maxsegsize*/
2961 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2962 				 			/* flags */
2963 				 &ss->rx_data.rx_big.dmat); /* tag */
2964 	if (err != 0) {
2965 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2966 		    err);
2967 		return err;
2968 	}
2969 
2970 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
2971 	    &ss->rx_data.rx_big.extra_map);
2972 	if (err != 0) {
2973 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
2974 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2975 		ss->rx_data.rx_big.dmat = NULL;
2976 		return err;
2977 	}
2978 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2979 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
2980 		    &ss->rx_data.rx_big.info[i].map);
2981 		if (err != 0) {
2982 			int j;
2983 
2984 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
2985 			for (j = 0; j < i; ++j) {
2986 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2987 				    ss->rx_data.rx_big.info[j].map);
2988 			}
2989 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2990 			    ss->rx_data.rx_big.extra_map);
2991 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2992 			ss->rx_data.rx_big.dmat = NULL;
2993 			return err;
2994 		}
2995 	}
2996 
2997 	/*
2998 	 * Now allocate TX resources
2999 	 */
3000 
3001 	ss->tx.mask = tx_ring_entries - 1;
3002 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3003 
3004 	/*
3005 	 * Allocate the tx request copy block; it MUST be at least
3006 	 * 8-byte aligned.
3007 	 */
3008 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3009 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3010 	    M_DEVBUF, M_WAITOK);
3011 
3012 	/* Allocate the tx busdma segment list */
3013 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3014 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3015 
3016 	/* Allocate the tx host info ring */
3017 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3018 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3019 
3020 	/* Allocate the tx busdma resources */
3021 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3022 				 1,			/* alignment */
3023 				 sc->tx_boundary,	/* boundary */
3024 				 BUS_SPACE_MAXADDR,	/* low */
3025 				 BUS_SPACE_MAXADDR,	/* high */
3026 				 NULL, NULL,		/* filter */
3027 				 IP_MAXPACKET +
3028 				 sizeof(struct ether_vlan_header),
3029 				 			/* maxsize */
3030 				 ss->tx.max_desc - 2,	/* num segs */
3031 				 sc->tx_boundary,	/* maxsegsz */
3032 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3033 				 BUS_DMA_ONEBPAGE,	/* flags */
3034 				 &ss->tx.dmat);		/* tag */
3035 	if (err != 0) {
3036 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3037 		return err;
3038 	}
3039 
3040 	/*
3041 	 * Now use these tags to setup DMA maps for each slot in the ring
3042 	 */
3043 	for (i = 0; i <= ss->tx.mask; i++) {
3044 		err = bus_dmamap_create(ss->tx.dmat,
3045 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3046 		if (err != 0) {
3047 			int j;
3048 
3049 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3050 			for (j = 0; j < i; ++j) {
3051 				bus_dmamap_destroy(ss->tx.dmat,
3052 				    ss->tx.info[j].map);
3053 			}
3054 			bus_dma_tag_destroy(ss->tx.dmat);
3055 			ss->tx.dmat = NULL;
3056 			return err;
3057 		}
3058 	}
3059 	return 0;
3060 }
3061 
3062 static int
3063 mxge_alloc_rings(mxge_softc_t *sc)
3064 {
3065 	mxge_cmd_t cmd;
3066 	int tx_ring_size;
3067 	int tx_ring_entries, rx_ring_entries;
3068 	int err, slice;
3069 
3070 	/* Get ring sizes */
3071 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3072 	if (err != 0) {
3073 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3074 		return err;
3075 	}
3076 	tx_ring_size = cmd.data0;
3077 
3078 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3079 	rx_ring_entries = sc->rx_ring_size / sizeof(mcp_dma_addr_t);
3080 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3081 	ifq_set_ready(&sc->ifp->if_snd);
3082 
3083 	for (slice = 0; slice < sc->num_slices; slice++) {
3084 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3085 		    rx_ring_entries, tx_ring_entries);
3086 		if (err != 0) {
3087 			device_printf(sc->dev,
3088 			    "alloc %d slice rings failed\n", slice);
3089 			return err;
3090 		}
3091 	}
3092 	return 0;
3093 }
3094 
3095 static void
3096 mxge_choose_params(int mtu, int *cl_size)
3097 {
3098 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3099 
3100 	if (bufsize < MCLBYTES) {
3101 		*cl_size = MCLBYTES;
3102 	} else {
3103 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3104 		*cl_size = MJUMPAGESIZE;
3105 	}
3106 }
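/*
 * Editorial note (worked example): with the standard 1500-byte MTU,
 * bufsize = 1500 + 14 (Ethernet header) + 4 (VLAN) + 2 (MXGEFW_PAD)
 * = 1520, which fits in an MCLBYTES (2KB) cluster; larger MTUs fall
 * through to page-sized MJUMPAGESIZE clusters.
 */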
3107 
3108 static int
3109 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3110 {
3111 	mxge_cmd_t cmd;
3112 	int err, i, slice;
3113 
3114 	slice = ss - ss->sc->ss;
3115 
3116 	/*
3117 	 * Get the lanai pointers to the send and receive rings
3118 	 */
3119 	err = 0;
3120 #ifndef IFNET_BUF_RING
3121 	/* We currently only send from the first slice */
3122 	if (slice == 0) {
3123 #endif
3124 		cmd.data0 = slice;
3125 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3126 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3127 		    (ss->sc->sram + cmd.data0);
3128 		ss->tx.send_go = (volatile uint32_t *)
3129 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3130 		ss->tx.send_stop = (volatile uint32_t *)
3131 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3132 #ifndef IFNET_BUF_RING
3133 	}
3134 #endif
3135 
3136 	cmd.data0 = slice;
3137 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3138 	ss->rx_data.rx_small.lanai =
3139 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3140 
3141 	cmd.data0 = slice;
3142 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3143 	ss->rx_data.rx_big.lanai =
3144 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3145 
3146 	if (err != 0) {
3147 		if_printf(ss->sc->ifp,
3148 		    "failed to get ring sizes or locations\n");
3149 		return EIO;
3150 	}
3151 
3152 	/*
3153 	 * Stock small receive ring
3154 	 */
3155 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3156 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3157 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3158 		if (err) {
3159 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3160 			    ss->rx_data.rx_small.mask + 1);
3161 			return ENOMEM;
3162 		}
3163 	}
3164 
3165 	/*
3166 	 * Stock big receive ring
3167 	 */
3168 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3169 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3170 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3171 	}
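	/*
	 * Editorial note: all-ones appears to be the same "not yet
	 * valid" marker that mxge_submit_8rx() writes, so unstocked
	 * slots are never used as DMA targets by the firmware.
	 */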
3172 
3173 	ss->rx_data.rx_big.cl_size = cl_size;
3174 
3175 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3176 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3177 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3178 		if (err) {
3179 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3180 			    ss->rx_data.rx_big.mask + 1);
3181 			return ENOMEM;
3182 		}
3183 	}
3184 	return 0;
3185 }
3186 
3187 static int
3188 mxge_open(mxge_softc_t *sc)
3189 {
3190 	struct ifnet *ifp = sc->ifp;
3191 	mxge_cmd_t cmd;
3192 	int err, slice, cl_size, i;
3193 	bus_addr_t bus;
3194 	volatile uint8_t *itable;
3195 	struct mxge_slice_state *ss;
3196 
3197 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3198 
3199 	/* Copy the MAC address in case it was overridden */
3200 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3201 
3202 	err = mxge_reset(sc, 1);
3203 	if (err != 0) {
3204 		if_printf(ifp, "failed to reset\n");
3205 		return EIO;
3206 	}
3207 
3208 	if (sc->num_slices > 1) {
3209 		/* Setup the indirection table */
3210 		cmd.data0 = sc->num_slices;
3211 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3212 
3213 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3214 		if (err != 0) {
3215 			if_printf(ifp, "failed to setup rss tables\n");
3216 			return err;
3217 		}
3218 
3219 		/* Just enable an identity mapping */
3220 		itable = sc->sram + cmd.data0;
3221 		for (i = 0; i < sc->num_slices; i++)
3222 			itable[i] = (uint8_t)i;
3223 
3224 		cmd.data0 = 1;
3225 		cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3226 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3227 		if (err != 0) {
3228 			if_printf(ifp, "failed to enable slices\n");
3229 			return err;
3230 		}
3231 	}
3232 
3233 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3234 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3235 	if (err) {
3236 		/*
3237 		 * Can't change TSO mode to NDIS, never allow TSO then
3238 		 */
3239 		if_printf(ifp, "failed to set TSO mode\n");
3240 		ifp->if_capenable &= ~IFCAP_TSO;
3241 		ifp->if_capabilities &= ~IFCAP_TSO;
3242 		ifp->if_hwassist &= ~CSUM_TSO;
3243 	}
3244 
3245 	mxge_choose_params(ifp->if_mtu, &cl_size);
3246 
3247 	cmd.data0 = 1;
3248 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3249 	/*
3250 	 * Error is only meaningful if we're trying to set
3251 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3252 	 */
3253 
3254 	/*
3255 	 * Give the firmware the mtu and the big and small buffer
3256 	 * sizes.  The firmware wants the big buf size to be a power
3257 	 * of two.  Luckily, DragonFly's clusters are powers of two.
3258 	 */
3259 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3260 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3261 
3262 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3263 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3264 
3265 	cmd.data0 = cl_size;
3266 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3267 
3268 	if (err != 0) {
3269 		if_printf(ifp, "failed to setup params\n");
3270 		goto abort;
3271 	}
3272 
3273 	/* Now give the firmware the pointer to the stats block */
3274 	for (slice = 0; slice < sc->num_slices; slice++) {
3275 		ss = &sc->ss[slice];
3276 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3277 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3278 		cmd.data2 = sizeof(struct mcp_irq_data);
3279 		cmd.data2 |= (slice << 16);
3280 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3281 	}
3282 
3283 	if (err != 0) {
3284 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3285 		bus += offsetof(struct mcp_irq_data, send_done_count);
3286 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3287 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3288 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3289 		    &cmd);
3290 
3291 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3292 		sc->fw_multicast_support = 0;
3293 	} else {
3294 		sc->fw_multicast_support = 1;
3295 	}
3296 
3297 	if (err != 0) {
3298 		if_printf(ifp, "failed to setup params\n");
3299 		goto abort;
3300 	}
3301 
3302 	for (slice = 0; slice < sc->num_slices; slice++) {
3303 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3304 		if (err != 0) {
3305 			if_printf(ifp, "couldn't open slice %d\n", slice);
3306 			goto abort;
3307 		}
3308 	}
3309 
3310 	/* Finally, start the firmware running */
3311 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3312 	if (err) {
3313 		if_printf(ifp, "Couldn't bring up link\n");
3314 		goto abort;
3315 	}
3316 	ifp->if_flags |= IFF_RUNNING;
3317 	ifq_clr_oactive(&ifp->if_snd);
3318 	ifp->if_timer = 0;
3319 
3320 	return 0;
3321 
3322 abort:
3323 	mxge_free_mbufs(sc);
3324 	return err;
3325 }
3326 
3327 static void
3328 mxge_close(mxge_softc_t *sc, int down)
3329 {
3330 	struct ifnet *ifp = sc->ifp;
3331 	mxge_cmd_t cmd;
3332 	int err, old_down_cnt;
3333 
3334 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3335 
3336 	ifp->if_flags &= ~IFF_RUNNING;
3337 	ifq_clr_oactive(&ifp->if_snd);
3338 	ifp->if_timer = 0;
3339 
3340 	if (!down) {
3341 		old_down_cnt = sc->down_cnt;
3342 		wmb();
3343 
3344 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3345 		if (err)
3346 			if_printf(ifp, "Couldn't bring down link\n");
3347 
3348 		if (old_down_cnt == sc->down_cnt) {
3349 			/* Wait for down irq */
3350 			ifnet_deserialize_all(ifp);
3351 			DELAY(10 * sc->intr_coal_delay);
3352 			ifnet_serialize_all(ifp);
3353 		}
3354 
3355 		wmb();
3356 		if (old_down_cnt == sc->down_cnt)
3357 			if_printf(ifp, "never got down irq\n");
3358 	}
3359 	mxge_free_mbufs(sc);
3360 }
3361 
3362 static void
3363 mxge_setup_cfg_space(mxge_softc_t *sc)
3364 {
3365 	device_t dev = sc->dev;
3366 	int reg;
3367 	uint16_t lnk, pectl;
3368 
3369 	/* Find the PCIe link width and set max read request to 4KB */
3370 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3371 		lnk = pci_read_config(dev, reg + 0x12, 2);
3372 		sc->link_width = (lnk >> 4) & 0x3f;
3373 
3374 		if (sc->pectl == 0) {
3375 			pectl = pci_read_config(dev, reg + 0x8, 2);
3376 			pectl = (pectl & ~0x7000) | (5 << 12);
3377 			pci_write_config(dev, reg + 0x8, pectl, 2);
3378 			sc->pectl = pectl;
3379 		} else {
3380 			/* Restore saved pectl after watchdog reset */
3381 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3382 		}
3383 	}
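	/*
	 * Editorial note: reg + 0x12 is the PCIe Link Status register
	 * (negotiated link width in bits 9:4) and reg + 0x8 is Device
	 * Control, where bits 14:12 encode Max_Read_Request_Size; the
	 * value 5 written above selects 4096 bytes.
	 */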
3384 
3385 	/* Enable DMA and memory space access */
3386 	pci_enable_busmaster(dev);
3387 }
3388 
3389 static uint32_t
3390 mxge_read_reboot(mxge_softc_t *sc)
3391 {
3392 	device_t dev = sc->dev;
3393 	uint32_t vs;
3394 
3395 	/* Find the vendor specific offset */
3396 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3397 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3398 		return (uint32_t)-1;
3399 	}
3400 	/* Enable read32 mode */
3401 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3402 	/* Tell NIC which register to read */
3403 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3404 	return pci_read_config(dev, vs + 0x14, 4);
3405 }
3406 
3407 static void
3408 mxge_watchdog_reset(mxge_softc_t *sc)
3409 {
3410 	struct pci_devinfo *dinfo;
3411 	int err, running;
3412 	uint32_t reboot;
3413 	uint16_t cmd;
3414 
3415 	err = ENXIO;
3416 
3417 	if_printf(sc->ifp, "Watchdog reset!\n");
3418 
3419 	/*
3420 	 * Check to see if the NIC rebooted.  If it did, then all of
3421 	 * PCI config space has been reset, and things like the
3422 	 * busmaster bit will be zero.  If this is the case, then we
3423 	 * must restore PCI config space before the NIC can be used
3424 	 * again
3425 	 */
3426 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3427 	if (cmd == 0xffff) {
3428 		/*
3429 		 * Maybe the watchdog caught the NIC rebooting; wait
3430 		 * up to 100ms for it to finish.  If it does not come
3431 		 * back, then give up
3432 		 */
3433 		DELAY(1000*100);
3434 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3435 		if (cmd == 0xffff)
3436 			if_printf(sc->ifp, "NIC disappeared!\n");
3437 	}
3438 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3439 		/* Print the reboot status */
3440 		reboot = mxge_read_reboot(sc);
3441 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3442 
3443 		running = sc->ifp->if_flags & IFF_RUNNING;
3444 		if (running) {
3445 			/*
3446 			 * Quiesce NIC so that TX routines will not try to
3447 			 * xmit after restoration of BAR
3448 			 */
3449 
3450 			/* Mark the link as down */
3451 			if (sc->link_state) {
3452 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3453 				if_link_state_change(sc->ifp);
3454 			}
3455 			mxge_close(sc, 1);
3456 		}
3457 		/* Restore PCI configuration space */
3458 		dinfo = device_get_ivars(sc->dev);
3459 		pci_cfg_restore(sc->dev, dinfo);
3460 
3461 		/* And redo any changes we made to our config space */
3462 		mxge_setup_cfg_space(sc);
3463 
3464 		/* Reload f/w */
3465 		err = mxge_load_firmware(sc, 0);
3466 		if (err)
3467 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3468 		if (running && !err) {
3469 			err = mxge_open(sc);
3470 			if_devstart_sched(sc->ifp);
3471 		}
3472 		sc->watchdog_resets++;
3473 	} else {
3474 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3475 		err = 0;
3476 	}
3477 	if (err) {
3478 		if_printf(sc->ifp, "watchdog reset failed\n");
3479 	} else {
3480 		if (sc->dying == 2)
3481 			sc->dying = 0;
3482 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3483 	}
3484 }
3485 
3486 static void
3487 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3488 {
3489 	if_printf(sc->ifp, "slice %d struck? ring state:\n", slice);
3490 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3491 	    tx->req, tx->done, tx->queue_active);
3492 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3493 	    tx->activate, tx->deactivate);
3494 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3495 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3496 }
3497 
3498 static u_long
3499 mxge_update_stats(mxge_softc_t *sc)
3500 {
3501 	u_long ipackets, opackets, pkts;
3502 
3503 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3504 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3505 
3506 	pkts = ipackets - sc->ipackets;
3507 	pkts += opackets - sc->opackets;
3508 
3509 	sc->ipackets = ipackets;
3510 	sc->opackets = opackets;
3511 
3512 	return pkts;
3513 }
3514 
3515 static void
3516 mxge_tick(void *arg)
3517 {
3518 	mxge_softc_t *sc = arg;
3519 	u_long pkts = 0;
3520 	int err = 0;
3521 	int ticks;
3522 
3523 	lwkt_serialize_enter(&sc->main_serialize);
3524 
3525 	ticks = mxge_ticks;
3526 	if (sc->ifp->if_flags & IFF_RUNNING) {
3527 		/* Aggregate stats from different slices */
3528 		pkts = mxge_update_stats(sc);
3529 		if (sc->need_media_probe)
3530 			mxge_media_probe(sc);
3531 	}
3532 	if (pkts == 0) {
3533 		uint16_t cmd;
3534 
3535 		/* Ensure NIC did not suffer h/w fault while idle */
3536 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3537 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3538 			sc->dying = 2;
3539 			mxge_serialize_skipmain(sc);
3540 			mxge_watchdog_reset(sc);
3541 			mxge_deserialize_skipmain(sc);
3542 			err = ENXIO;
3543 		}
3544 
3545 		/* Look less often if NIC is idle */
3546 		ticks *= 4;
3547 	}
3548 
3549 	if (err == 0)
3550 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3551 
3552 	lwkt_serialize_exit(&sc->main_serialize);
3553 }
3554 
3555 static int
3556 mxge_media_change(struct ifnet *ifp)
3557 {
3558 	return EINVAL;
3559 }
3560 
3561 static int
3562 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3563 {
3564 	struct ifnet *ifp = sc->ifp;
3565 	int real_mtu, old_mtu;
3566 	int err = 0;
3567 
3568 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3569 	if (mtu > sc->max_mtu || real_mtu < 60)
3570 		return EINVAL;
3571 
3572 	old_mtu = ifp->if_mtu;
3573 	ifp->if_mtu = mtu;
3574 	if (ifp->if_flags & IFF_RUNNING) {
3575 		mxge_close(sc, 0);
3576 		err = mxge_open(sc);
3577 		if (err != 0) {
3578 			ifp->if_mtu = old_mtu;
3579 			mxge_close(sc, 0);
3580 			mxge_open(sc);
3581 		}
3582 	}
3583 	return err;
3584 }
3585 
3586 static void
3587 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3588 {
3589 	mxge_softc_t *sc = ifp->if_softc;
3590 
3591 
3592 	if (sc == NULL)
3593 		return;
3594 	ifmr->ifm_status = IFM_AVALID;
3595 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3596 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3597 	ifmr->ifm_active |= sc->current_media;
3598 }
3599 
3600 static int
3601 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3602     struct ucred *cr __unused)
3603 {
3604 	mxge_softc_t *sc = ifp->if_softc;
3605 	struct ifreq *ifr = (struct ifreq *)data;
3606 	int err, mask;
3607 
3608 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3609 	err = 0;
3610 
3611 	switch (command) {
3612 	case SIOCSIFMTU:
3613 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3614 		break;
3615 
3616 	case SIOCSIFFLAGS:
3617 		if (sc->dying)
3618 			return EINVAL;
3619 
3620 		if (ifp->if_flags & IFF_UP) {
3621 			if (!(ifp->if_flags & IFF_RUNNING)) {
3622 				err = mxge_open(sc);
3623 			} else {
3624 				/*
3625 				 * Take care of PROMISC and ALLMULTI
3626 				 * flag changes
3627 				 */
3628 				mxge_change_promisc(sc,
3629 				    ifp->if_flags & IFF_PROMISC);
3630 				mxge_set_multicast_list(sc);
3631 			}
3632 		} else {
3633 			if (ifp->if_flags & IFF_RUNNING)
3634 				mxge_close(sc, 0);
3635 		}
3636 		break;
3637 
3638 	case SIOCADDMULTI:
3639 	case SIOCDELMULTI:
3640 		mxge_set_multicast_list(sc);
3641 		break;
3642 
3643 	case SIOCSIFCAP:
3644 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3645 		if (mask & IFCAP_TXCSUM) {
3646 			ifp->if_capenable ^= IFCAP_TXCSUM;
3647 			if (ifp->if_capenable & IFCAP_TXCSUM)
3648 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3649 			else
3650 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3651 		}
3652 		if (mask & IFCAP_TSO) {
3653 			ifp->if_capenable ^= IFCAP_TSO;
3654 			if (ifp->if_capenable & IFCAP_TSO)
3655 				ifp->if_hwassist |= CSUM_TSO;
3656 			else
3657 				ifp->if_hwassist &= ~CSUM_TSO;
3658 		}
3659 		if (mask & IFCAP_RXCSUM)
3660 			ifp->if_capenable ^= IFCAP_RXCSUM;
3661 		if (mask & IFCAP_VLAN_HWTAGGING)
3662 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3663 		break;
3664 
3665 	case SIOCGIFMEDIA:
3666 		mxge_media_probe(sc);
3667 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3668 		    &sc->media, command);
3669 		break;
3670 
3671 	default:
3672 		err = ether_ioctl(ifp, command, data);
3673 		break;
3674 	}
3675 	return err;
3676 }
3677 
3678 static void
3679 mxge_fetch_tunables(mxge_softc_t *sc)
3680 {
3681 	sc->intr_coal_delay = mxge_intr_coal_delay;
3682 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3683 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3684 
3685 	/* XXX */
3686 	if (mxge_ticks == 0)
3687 		mxge_ticks = hz / 2;
3688 
3689 	sc->pause = mxge_flow_control;
3690 
3691 	sc->throttle = mxge_throttle;
3692 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3693 		sc->throttle = MXGE_MAX_THROTTLE;
3694 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3695 		sc->throttle = MXGE_MIN_THROTTLE;
3696 }
3697 
3698 static void
3699 mxge_free_slices(mxge_softc_t *sc)
3700 {
3701 	struct mxge_slice_state *ss;
3702 	int i;
3703 
3704 	if (sc->ss == NULL)
3705 		return;
3706 
3707 	for (i = 0; i < sc->num_slices; i++) {
3708 		ss = &sc->ss[i];
3709 		if (ss->fw_stats != NULL) {
3710 			mxge_dma_free(&ss->fw_stats_dma);
3711 			ss->fw_stats = NULL;
3712 		}
3713 		if (ss->rx_data.rx_done.entry != NULL) {
3714 			mxge_dma_free(&ss->rx_done_dma);
3715 			ss->rx_data.rx_done.entry = NULL;
3716 		}
3717 	}
3718 	kfree(sc->ss, M_DEVBUF);
3719 	sc->ss = NULL;
3720 }
3721 
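/*
 * Allocate the per-slice state: for each slice, an rx completion
 * (interrupt) queue and a firmware stats block, each in its own
 * DMA-able region.
 */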
3722 static int
3723 mxge_alloc_slices(mxge_softc_t *sc)
3724 {
3725 	mxge_cmd_t cmd;
3726 	struct mxge_slice_state *ss;
3727 	size_t bytes;
3728 	int err, i, max_intr_slots;
3729 
3730 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3731 	if (err != 0) {
3732 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3733 		return err;
3734 	}
3735 	sc->rx_ring_size = cmd.data0;
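	/*
	 * The rx ring size is reported in bytes and each ring entry is an
	 * mcp_dma_addr_t; the factor of two presumably leaves room for
	 * completions from both the small and the big rx rings.
	 */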
3736 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3737 
3738 	bytes = sizeof(*sc->ss) * sc->num_slices;
3739 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3740 
3741 	for (i = 0; i < sc->num_slices; i++) {
3742 		ss = &sc->ss[i];
3743 
3744 		ss->sc = sc;
3745 
3746 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3747 		lwkt_serialize_init(&ss->tx.tx_serialize);
3748 
3749 		/*
3750 		 * Allocate per-slice rx interrupt queues
3751 		 */
3752 		bytes = max_intr_slots * sizeof(*ss->rx_data.rx_done.entry);
3753 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3754 		if (err != 0) {
3755 			device_printf(sc->dev,
3756 			    "alloc slice %d rx_done failed\n", i);
3757 			return err;
3758 		}
3759 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3760 
3761 		/*
3762 		 * Allocate the per-slice firmware stats
3763 		 */
3764 		bytes = sizeof(*ss->fw_stats);
3765 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
3767 		if (err != 0) {
3768 			device_printf(sc->dev,
3769 			    "alloc slice %d fw_stats failed\n", i);
3770 			return err;
3771 		}
3772 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3773 	}
3774 	return 0;
3775 }
3776 
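/*
 * Decide how many slices (rx queues) to use.  Load the RSS firmware,
 * ask it for its maximum slice count, then clamp that to the number
 * of MSI-X vectors, the number of CPUs and the mxge_max_slices
 * tunable.  On any failure, fall back to a single slice and restore
 * the original firmware.
 */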
3777 static void
3778 mxge_slice_probe(mxge_softc_t *sc)
3779 {
3780 	mxge_cmd_t cmd;
3781 	const char *old_fw;
3782 	int msix_cnt, status, max_intr_slots;
3783 
3784 	sc->num_slices = 1;
3785 
3786 	/*
3787 	 * XXX
3788 	 *
3789 	 * Don't enable multiple slices if they have been disabled by the
3790 	 * tunable, or if this is not an SMP system.
3791 	 */
3792 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3793 		return;
3794 
3795 	/* see how many MSI-X interrupts are available */
3796 	msix_cnt = pci_msix_count(sc->dev);
3797 	if (msix_cnt < 2)
3798 		return;
3799 
3800 	/* now load the slice aware firmware to see what it supports */
3801 	old_fw = sc->fw_name;
3802 	if (old_fw == mxge_fw_aligned)
3803 		sc->fw_name = mxge_fw_rss_aligned;
3804 	else
3805 		sc->fw_name = mxge_fw_rss_unaligned;
3806 	status = mxge_load_firmware(sc, 0);
3807 	if (status != 0) {
3808 		device_printf(sc->dev, "Falling back to a single slice\n");
3809 		return;
3810 	}
3811 
3812 	/* try to send a reset command to the card to see if it
3813 	   is alive */
3814 	memset(&cmd, 0, sizeof (cmd));
3815 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3816 	if (status != 0) {
3817 		device_printf(sc->dev, "failed reset\n");
3818 		goto abort_with_fw;
3819 	}
3820 
3821 	/* get rx ring size */
3822 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3823 	if (status != 0) {
3824 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3825 		goto abort_with_fw;
3826 	}
3827 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3828 
3829 	/* tell it the size of the interrupt queues */
3830 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3831 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3832 	if (status != 0) {
3833 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3834 		goto abort_with_fw;
3835 	}
3836 
3837 	/* ask for the maximum number of slices it supports */
3838 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3839 	if (status != 0) {
3840 		device_printf(sc->dev,
3841 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3842 		goto abort_with_fw;
3843 	}
3844 	sc->num_slices = cmd.data0;
3845 	if (sc->num_slices > msix_cnt)
3846 		sc->num_slices = msix_cnt;
3847 
3848 	if (mxge_max_slices == -1) {
3849 		/* cap to number of CPUs in system */
3850 		if (sc->num_slices > ncpus)
3851 			sc->num_slices = ncpus;
3852 	} else {
3853 		if (sc->num_slices > mxge_max_slices)
3854 			sc->num_slices = mxge_max_slices;
3855 	}
3856 	/*
	 * Round the slice count down to the nearest power of two; the
	 * firmware appears to require a power-of-two count for its RSS
	 * queue mapping.
	 */
3857 	while (sc->num_slices & (sc->num_slices - 1))
3858 		sc->num_slices--;
3859 
3860 	if (bootverbose)
3861 		device_printf(sc->dev, "using %d slices\n",
3862 			      sc->num_slices);
3863 
3864 	return;
3865 
3866 abort_with_fw:
3867 	sc->fw_name = old_fw;
3868 	(void) mxge_load_firmware(sc, 0);
3869 }
3870 
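/*
 * The MSI-X code below is currently compiled out; mxge_add_irq()
 * always takes the single-interrupt path.
 */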
3871 #if 0
3872 static int
3873 mxge_add_msix_irqs(mxge_softc_t *sc)
3874 {
3875 	size_t bytes;
3876 	int count, err, i, rid;
3877 
3878 	rid = PCIR_BAR(2);
3879 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3880 						    &rid, RF_ACTIVE);
3881 
3882 	if (sc->msix_table_res == NULL) {
3883 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3884 		return ENXIO;
3885 	}
3886 
3887 	count = sc->num_slices;
3888 	err = pci_alloc_msix(sc->dev, &count);
3889 	if (err != 0) {
3890 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3891 			      "err = %d\n", sc->num_slices, err);
3892 		goto abort_with_msix_table;
3893 	}
3894 	if (count < sc->num_slices) {
3895 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3896 			      sc->num_slices, count);
3897 		device_printf(sc->dev,
3898 			      "Try setting hw.mxge.max_slices to %d\n",
3899 			      count);
3900 		err = ENOSPC;
3901 		goto abort_with_msix;
3902 	}
3903 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3904 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3905 	if (sc->msix_irq_res == NULL) {
3906 		err = ENOMEM;
3907 		goto abort_with_msix;
3908 	}
3909 
3910 	for (i = 0; i < sc->num_slices; i++) {
3911 		rid = i + 1;
3912 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3913 							  SYS_RES_IRQ,
3914 							  &rid, RF_ACTIVE);
3915 		if (sc->msix_irq_res[i] == NULL) {
3916 			device_printf(sc->dev, "couldn't allocate IRQ res"
3917 				      " for message %d\n", i);
3918 			err = ENXIO;
3919 			goto abort_with_res;
3920 		}
3921 	}
3922 
3923 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3924 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3925 
3926 	for (i = 0; i < sc->num_slices; i++) {
3927 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3928 				     INTR_MPSAFE,
3929 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
3930 				     sc->ifp->if_serializer);
3931 		if (err != 0) {
3932 			device_printf(sc->dev, "couldn't setup intr for "
3933 				      "message %d\n", i);
3934 			goto abort_with_intr;
3935 		}
3936 	}
3937 
3938 	if (bootverbose) {
3939 		device_printf(sc->dev, "using %d msix IRQs:",
3940 			      sc->num_slices);
3941 		for (i = 0; i < sc->num_slices; i++)
3942 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
3943 		kprintf("\n");
3944 	}
3945 	return (0);
3946 
3947 abort_with_intr:
3948 	for (i = 0; i < sc->num_slices; i++) {
3949 		if (sc->msix_ih[i] != NULL) {
3950 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3951 					  sc->msix_ih[i]);
3952 			sc->msix_ih[i] = NULL;
3953 		}
3954 	}
3955 	kfree(sc->msix_ih, M_DEVBUF);
3956 
3958 abort_with_res:
3959 	for (i = 0; i < sc->num_slices; i++) {
3960 		rid = i + 1;
3961 		if (sc->msix_irq_res[i] != NULL)
3962 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3963 					     sc->msix_irq_res[i]);
3964 		sc->msix_irq_res[i] = NULL;
3965 	}
3966 	kfree(sc->msix_irq_res, M_DEVBUF);
3967 
3969 abort_with_msix:
3970 	pci_release_msi(sc->dev);
3971 
3972 abort_with_msix_table:
3973 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3974 			     sc->msix_table_res);
3975 
3976 	return err;
3977 }
3978 #endif
3979 
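/*
 * Allocate and set up a single MSI or legacy INTx interrupt; all
 * work is then funneled through slice 0.
 */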
3980 static int
3981 mxge_add_single_irq(mxge_softc_t *sc)
3982 {
3983 	driver_intr_t *intr_func;
3984 	u_int irq_flags;
3985 
3986 	sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
3987 	    &sc->irq_rid, &irq_flags);
3988 
3989 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
3990 	    &sc->irq_rid, irq_flags);
3991 	if (sc->irq_res == NULL) {
3992 		device_printf(sc->dev, "could not alloc interrupt\n");
3993 		return ENXIO;
3994 	}
3995 
3996 	if (sc->irq_type == PCI_INTR_TYPE_LEGACY)
3997 		intr_func = mxge_legacy;
3998 	else
3999 		intr_func = mxge_msi;
4000 
4001 	return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
4002 	    intr_func, &sc->ss[0], &sc->ih, &sc->main_serialize);
4003 }
4004 
4005 #if 0
4006 static void
4007 mxge_rem_msix_irqs(mxge_softc_t *sc)
4008 {
4009 	int i, rid;
4010 
4011 	for (i = 0; i < sc->num_slices; i++) {
4012 		if (sc->msix_ih[i] != NULL) {
4013 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4014 					  sc->msix_ih[i]);
4015 			sc->msix_ih[i] = NULL;
4016 		}
4017 	}
4018 	kfree(sc->msix_ih, M_DEVBUF);
4019 
4020 	for (i = 0; i < sc->num_slices; i++) {
4021 		rid = i + 1;
4022 		if (sc->msix_irq_res[i] != NULL)
4023 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4024 					     sc->msix_irq_res[i]);
4025 		sc->msix_irq_res[i] = NULL;
4026 	}
4027 	kfree(sc->msix_irq_res, M_DEVBUF);
4028 
4029 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4030 			     sc->msix_table_res);
4031 
4032 	pci_release_msi(sc->dev);
4034 }
4035 #endif
4036 
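/*
 * Interrupt setup entry point.  Only the single-interrupt path is
 * live until the MSI-X code above is resurrected.
 */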
4037 static int
4038 mxge_add_irq(mxge_softc_t *sc)
4039 {
4040 #if 0
4041 	int err;
4042 
4043 	if (sc->num_slices > 1)
4044 		err = mxge_add_msix_irqs(sc);
4045 	else
4046 		err = mxge_add_single_irq(sc);
4047 
4048 	if (0 && err == 0 && sc->num_slices > 1) {
4049 		mxge_rem_msix_irqs(sc);
4050 		err = mxge_add_msix_irqs(sc);
4051 	}
4052 	return err;
4053 #else
4054 	return mxge_add_single_irq(sc);
4055 #endif
4056 }
4057 
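/*
 * Build the flat serializer array used by the ifnet serialize
 * methods below.  The layout is:
 *
 *   [0]				main_serialize
 *   [1 .. num_slices]			per-slice rx serializers
 *   [num_slices + 1 .. 2*num_slices]	per-slice tx serializers
 */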
4058 static void
4059 mxge_setup_serialize(struct mxge_softc *sc)
4060 {
4061 	int i = 0, slice;
4062 
4063 	/* Main + rx + tx */
4064 	sc->nserialize = (2 * sc->num_slices) + 1;
4065 	sc->serializes =
4066 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4067 	        M_DEVBUF, M_WAITOK | M_ZERO);
4068 
4069 	/*
4070 	 * Setup serializes
4071 	 *
4072 	 * NOTE: Order is critical
4073 	 */
4074 
4075 	KKASSERT(i < sc->nserialize);
4076 	sc->serializes[i++] = &sc->main_serialize;
4077 
4078 	for (slice = 0; slice < sc->num_slices; ++slice) {
4079 		KKASSERT(i < sc->nserialize);
4080 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4081 	}
4082 
4083 	for (slice = 0; slice < sc->num_slices; ++slice) {
4084 		KKASSERT(i < sc->nserialize);
4085 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4086 	}
4087 
4088 	KKASSERT(i == sc->nserialize);
4089 }
4090 
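/*
 * ifnet serialize methods: enter, exit and try-enter over the array
 * built by mxge_setup_serialize().
 */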
4091 static void
4092 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4093 {
4094 	struct mxge_softc *sc = ifp->if_softc;
4095 
4096 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4097 }
4098 
4099 static void
4100 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4101 {
4102 	struct mxge_softc *sc = ifp->if_softc;
4103 
4104 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4105 }
4106 
4107 static int
4108 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4109 {
4110 	struct mxge_softc *sc = ifp->if_softc;
4111 
4112 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4113 }
4114 
4115 #ifdef INVARIANTS
4116 
4117 static void
4118 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4119     boolean_t serialized)
4120 {
4121 	struct mxge_softc *sc = ifp->if_softc;
4122 
4123 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4124 	    slz, serialized);
4125 }
4126 
4127 #endif	/* INVARIANTS */
4128 
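/*
 * Device attach: map the board, parse the EEPROM strings, load and
 * reset the firmware, allocate slices and rings, and attach the
 * ifnet.  Any failure falls through to mxge_detach() for cleanup.
 */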
4129 static int
4130 mxge_attach(device_t dev)
4131 {
4132 	mxge_softc_t *sc = device_get_softc(dev);
4133 	struct ifnet *ifp = &sc->arpcom.ac_if;
4134 	int err, rid;
4135 
4136 	/*
4137 	 * Avoid rewriting half the lines in this file to use
4138 	 * &sc->arpcom.ac_if instead
4139 	 */
4140 	sc->ifp = ifp;
4141 	sc->dev = dev;
4142 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4143 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4144 
4145 	lwkt_serialize_init(&sc->main_serialize);
4146 
4147 	mxge_fetch_tunables(sc);
4148 
4149 	err = bus_dma_tag_create(NULL,			/* parent */
4150 				 1,			/* alignment */
4151 				 0,			/* boundary */
4152 				 BUS_SPACE_MAXADDR,	/* low */
4153 				 BUS_SPACE_MAXADDR,	/* high */
4154 				 NULL, NULL,		/* filter */
4155 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4156 				 0, 			/* num segs */
4157 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4158 				 0,			/* flags */
4159 				 &sc->parent_dmat);	/* tag */
4160 	if (err != 0) {
4161 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4162 		goto failed;
4163 	}
4164 
4165 	callout_init_mp(&sc->co_hdl);
4166 
4167 	mxge_setup_cfg_space(sc);
4168 
4169 	/*
4170 	 * Map the board into the kernel
4171 	 */
4172 	rid = PCIR_BARS;
4173 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4174 	    &rid, RF_ACTIVE);
4175 	if (sc->mem_res == NULL) {
4176 		device_printf(dev, "could not map memory\n");
4177 		err = ENXIO;
4178 		goto failed;
4179 	}
4180 
4181 	sc->sram = rman_get_virtual(sc->mem_res);
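	/*
	 * Usable SRAM: the 2MB window less the regions at the top that are
	 * presumably reserved for firmware and scratch use (2 x 48KB plus
	 * 32KB) and a final 0x100 bytes.
	 */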
4182 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4183 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4184 		device_printf(dev, "impossible memory region size %ld\n",
4185 		    rman_get_size(sc->mem_res));
4186 		err = ENXIO;
4187 		goto failed;
4188 	}
4189 
4190 	/*
4191 	 * Make NULL terminated copy of the EEPROM strings section of
4192 	 * lanai SRAM
4193 	 */
4194 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4195 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4196 	    rman_get_bushandle(sc->mem_res),
4197 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4198 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
4199 	err = mxge_parse_strings(sc);
4200 	if (err != 0) {
4201 		device_printf(dev, "parse EEPROM string failed\n");
4202 		goto failed;
4203 	}
4204 
4205 	/*
4206 	 * Enable write combining for efficient use of PCIe bus
4207 	 */
4208 	mxge_enable_wc(sc);
4209 
4210 	/*
4211 	 * Allocate the out of band DMA memory
4212 	 */
4213 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4214 	if (err != 0) {
4215 		device_printf(dev, "alloc cmd DMA buf failed\n");
4216 		goto failed;
4217 	}
4218 	sc->cmd = sc->cmd_dma.dmem_addr;
4219 
4220 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4221 	if (err != 0) {
4222 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4223 		goto failed;
4224 	}
4225 
4226 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4227 	if (err != 0) {
4228 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4229 		goto failed;
4230 	}
4231 
4232 	/* Select & load the firmware */
4233 	err = mxge_select_firmware(sc);
4234 	if (err != 0) {
4235 		device_printf(dev, "select firmware failed\n");
4236 		goto failed;
4237 	}
4238 
4239 	mxge_slice_probe(sc);
4240 	err = mxge_alloc_slices(sc);
4241 	if (err != 0) {
4242 		device_printf(dev, "alloc slices failed\n");
4243 		goto failed;
4244 	}
4245 
4246 	/* Setup serializes */
4247 	mxge_setup_serialize(sc);
4248 
4249 	err = mxge_reset(sc, 0);
4250 	if (err != 0) {
4251 		device_printf(dev, "reset failed\n");
4252 		goto failed;
4253 	}
4254 
4255 	err = mxge_alloc_rings(sc);
4256 	if (err != 0) {
4257 		device_printf(dev, "failed to allocate rings\n");
4258 		goto failed;
4259 	}
4260 
4261 	ifp->if_baudrate = IF_Gbps(10UL);
4262 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4263 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4264 
4265 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4266 #if 0
4267 	/* Well, it's software, sigh */
4268 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4269 #endif
4270 	ifp->if_capenable = ifp->if_capabilities;
4271 
4272 	ifp->if_softc = sc;
4273 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4274 	ifp->if_init = mxge_init;
4275 	ifp->if_ioctl = mxge_ioctl;
4276 	ifp->if_start = mxge_start;
4277 	ifp->if_watchdog = mxge_watchdog;
4278 	ifp->if_serialize = mxge_serialize;
4279 	ifp->if_deserialize = mxge_deserialize;
4280 	ifp->if_tryserialize = mxge_tryserialize;
4281 #ifdef INVARIANTS
4282 	ifp->if_serialize_assert = mxge_serialize_assert;
4283 #endif
4284 
4285 	/* Increase TSO burst length */
4286 	ifp->if_tsolen = (32 * ETHERMTU);
4287 
4288 	/* Initialize the ifmedia structure */
4289 	mxge_media_init(sc);
4290 	mxge_media_probe(sc);
4291 
4292 	ether_ifattach(ifp, sc->mac_addr, NULL);
4293 
4294 	/*
4295 	 * XXX
4296 	 * We are not ready to do "gather" jumbo frame, so
4297 	 * limit MTU to MJUMPAGESIZE
4298 	 */
4299 	sc->max_mtu = MJUMPAGESIZE -
4300 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4301 	sc->dying = 0;
4302 
4303 	/* must come after ether_ifattach() */
4304 	err = mxge_add_irq(sc);
4305 	if (err != 0) {
4306 		device_printf(dev, "alloc and setup intr failed\n");
4307 		ether_ifdetach(ifp);
4308 		goto failed;
4309 	}
4310 
4311 	ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4312 	ifq_set_hw_serialize(&ifp->if_snd, &sc->ss[0].tx.tx_serialize);
4313 
4314 	mxge_add_sysctls(sc);
4315 
4316 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4317 	    rman_get_cpuid(sc->irq_res));
4318 	return 0;
4319 
4320 failed:
4321 	mxge_detach(dev);
4322 	return err;
4323 }
4324 
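/*
 * Device detach.  Also serves as the error-unwind path for
 * mxge_attach(), so every step must tolerate partially initialized
 * state.
 */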
4325 static int
4326 mxge_detach(device_t dev)
4327 {
4328 	mxge_softc_t *sc = device_get_softc(dev);
4329 
4330 	if (device_is_attached(dev)) {
4331 		struct ifnet *ifp = sc->ifp;
4332 
4333 		ifnet_serialize_all(ifp);
4334 
4335 		sc->dying = 1;
4336 		if (ifp->if_flags & IFF_RUNNING)
4337 			mxge_close(sc, 1);
4338 		callout_stop(&sc->co_hdl);
4339 
4340 		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4341 
4342 		ifnet_deserialize_all(ifp);
4343 
4344 		callout_terminate(&sc->co_hdl);
4345 
4346 		ether_ifdetach(ifp);
4347 	}
4348 	ifmedia_removeall(&sc->media);
4349 
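	/*
	 * Turn off the firmware's dummy RDMAs before the DMA buffers
	 * backing them are freed; this is only possible if the command
	 * buffer, zeropad buffer and SRAM mapping were all set up.
	 */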
4350 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4351 	    sc->sram != NULL)
4352 		mxge_dummy_rdma(sc, 0);
4353 
4354 	mxge_rem_sysctls(sc);
4355 	mxge_free_rings(sc);
4356 
4357 	/* MUST after sysctls and rings are freed */
4358 	mxge_free_slices(sc);
4359 
4360 	if (sc->dmabench_dma.dmem_addr != NULL)
4361 		mxge_dma_free(&sc->dmabench_dma);
4362 	if (sc->zeropad_dma.dmem_addr != NULL)
4363 		mxge_dma_free(&sc->zeropad_dma);
4364 	if (sc->cmd_dma.dmem_addr != NULL)
4365 		mxge_dma_free(&sc->cmd_dma);
4366 
4367 	if (sc->irq_res != NULL) {
4368 		bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
4369 		    sc->irq_res);
4370 	}
4371 	if (sc->irq_type == PCI_INTR_TYPE_MSI)
4372 		pci_release_msi(dev);
4373 
4374 	if (sc->mem_res != NULL) {
4375 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4376 		    sc->mem_res);
4377 	}
4378 
4379 	if (sc->parent_dmat != NULL)
4380 		bus_dma_tag_destroy(sc->parent_dmat);
4381 
4382 	return 0;
4383 }
4384 
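/*
 * Nothing to do at shutdown; detach handles quiescing the device.
 */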
4385 static int
4386 mxge_shutdown(device_t dev)
4387 {
4388 	return 0;
4389 }
4390