/******************************************************************************
SPDX-License-Identifier: BSD-2-Clause-FreeBSD

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <contrib/zlib/zlib.h>
#include <dev/zlib/zcalloc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

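/*
 * Example (illustrative only, not part of the driver): the typical
 * life cycle of a DMA region using the helpers above.  The wrapper
 * name below is hypothetical; "sc" is assumed to be a fully attached
 * softc with a valid parent_dmat.
 */
#if 0
static int
mxge_dma_example(mxge_softc_t *sc)
{
	mxge_dma_t dma;
	int err;

	/* 4096 bytes, 4KB-aligned; on success dma.addr is the host
	 * virtual address and dma.bus_addr the device-visible address */
	err = mxge_dma_alloc(sc, &dma, 4096, 4096);
	if (err != 0)
		return err;

	/* ... hand dma.bus_addr to the NIC, touch dma.addr from the host ... */

	mxge_dma_free(&dma);
	return 0;
}
#endif
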
/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

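/*
 * Worked example for the parser above (the EEPROM contents shown here
 * are hypothetical).  The strings section is a run of NUL-terminated
 * records ended by an empty record:
 *
 *	"MAC=00:60:dd:47:ab:cd\0SN=123456\0PC=M3F-EXAMPLE\0\0"
 *
 * Parsing this fills sc->mac_addr with 00:60:dd:47:ab:cd,
 * sc->serial_number_string with "123456" (an "SN2=" record, if
 * present, would take precedence), and sc->product_code_string with
 * "M3F-EXAMPLE".
 */
#if 0
/* A minimal sketch of exercising the parser with the sample above. */
static void
mxge_parse_example(mxge_softc_t *sc)
{
	static const char sample[] =
	    "MAC=00:60:dd:47:ab:cd\0SN=123456\0PC=M3F-EXAMPLE\0";

	/* the string literal's implicit terminator supplies the
	 * empty record that stops the parse loop */
	memcpy(sc->eeprom_strings, sample, sizeof(sample));
	(void)mxge_parse_strings(sc);	/* returns 0 and fills sc->mac_addr */
}
#endif
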
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

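/*
 * Worked example of the result encoding above (the numbers are made
 * up): with len == 4096, suppose the read test returns
 * cmd.data0 == 0x01000200, i.e. 0x0100 (256) transfers completed in
 * 0x0200 (512) half-microsecond ticks.  Each transfer moves len bytes
 * and the factor of 2 converts bytes per 0.5us tick to bytes/us
 * (~MB/s), so:
 *
 *	read_dma = (256 * 4096 * 2) / 512 = 4096 MB/s
 */
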
/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

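/*
 * The policy above, condensed (a summary only; the authoritative
 * logic is in mxge_firmware_probe() and mxge_select_firmware() below):
 *
 *	completions known aligned (ECRC on, probe passed, or forced)
 *		-> eth_z8e  firmware, tx_boundary = 4096
 *	completions unaligned or unverifiable
 *		-> ethp_z8e firmware, tx_boundary = 2048
 */
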
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = zcalloc_nowait;
	zs.zfree = zcfree;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

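/*
 * The stack-buffer trick used above (and again in mxge_send_cmd()
 * and mxge_load_firmware()): carve an 8-byte-aligned window out of
 * an oversized char array.  A minimal standalone sketch:
 */
#if 0
char buf_bytes[72];
uint32_t *buf;

/* Round the address up to the next multiple of 8.  At most 7 bytes
 * are skipped, so the 64 usable bytes that the firmware handoff
 * needs still fit inside the 72-byte array. */
buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
#endif
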
static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

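/*
 * Example (illustrative only): a typical caller of the mailbox helper
 * above.  Arguments go to the firmware in cmd.data0..data2; on success
 * the single reply word comes back in cmd.data0.
 */
#if 0
static int
mxge_query_rx_ring_size(mxge_softc_t *sc, int *ring_size)
{
	mxge_cmd_t cmd;
	int err;

	memset(&cmd, 0, sizeof(cmd));
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err == 0)
		*ring_size = cmd.data0;	/* reply is in data0 */
	return err;
}
#endif
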
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

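/*
 * Worked example of the packing above (hypothetical address): for
 * MAC 00:60:dd:47:ab:cd the firmware expects
 *
 *	cmd.data0 = 0x0060dd47	(bytes 0-3, most significant first)
 *	cmd.data1 = 0x0000abcd	(bytes 4-5)
 */
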
static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;
	int error;
};

static u_int
mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
{
	struct mxge_add_maddr_ctx *ctx = arg;
	mxge_cmd_t cmd;

	if (ctx->error != 0)
		return (0);
	bcopy(LLADDR(sdl), &cmd.data0, 4);
	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
	cmd.data0 = htonl(cmd.data0);
	cmd.data1 = htonl(cmd.data1);

	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);

	return (1);
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	struct mxge_add_maddr_ctx ctx;
	struct ifnet *ifp = sc->ifp;
	mxge_cmd_t cmd;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	ctx.sc = sc;
	ctx.error = 0;
	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
	if (ctx.error != 0) {
		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
		    "error status: %d\n", ctx.error);
		/* abort, leaving multicast filtering off */
		return;
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    sc, 0, mxge_change_intr_coal, "I",
	    "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_throttle, "I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "flow_control_enabled",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    mxge_change_flow_control, "I",
	    "enable/disable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
	    "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
	    "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_error_or_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
	    "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_link_overflow",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
	    "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_multicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_big_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_no_small_buffer",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
	    "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_overrun",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
	    "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
	    "dropped_unicast_filtered",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
	    "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}

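/*
 * The "valid flag last" idiom used above, reduced to a standalone
 * sketch (the types and names here are illustrative, not driver API):
 * every descriptor is written with its flags cleared, and only after
 * the whole chain is globally visible is the first descriptor's flag
 * word rewritten, which hands the entire chain to the NIC in a single
 * write.
 */
#if 0
static void
submit_chain(volatile example_desc_t *ring, example_desc_t *src, int cnt)
{
	uint8_t valid_flags = src[0].flags;
	int i;

	src[0].flags = 0;		/* keep slot 0 invalid for now */
	for (i = 0; i < cnt; i++)
		ring[i] = src[i];	/* NIC ignores the chain so far */
	wmb();				/* order the body before the flag */
	ring[0].flags = valid_flags;	/* one write makes it all live */
}
#endif
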
static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
1891 	 * that request. For TSO send requests with one or more cuts
1892 	 * in the middle, this is the number of RDMAs starting
1893 	 * after the last cut in the request. All previous
1894 	 * segments before the last cut implicitly have 1 RDMA.
1895 	 *
1896 	 * Since the number of RDMAs is not known beforehand,
1897 	 * it must be filled-in retroactively - after each
1898 	 * segmentation cut or at the end of the entire packet.
1899 	 */
1900 
1901 	while (busdma_seg_cnt) {
1902 		/* Break the busdma segment up into pieces */
1903 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1904 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1905 		len = seg->ds_len;
1906 
1907 		while (len) {
1908 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1909 			seglen = len;
1910 			cum_len_next = cum_len + seglen;
1911 			(req-rdma_count)->rdma_count = rdma_count + 1;
1912 			if (__predict_true(cum_len >= 0)) {
1913 				/* payload */
1914 				chop = (cum_len_next > mss);
1915 				cum_len_next = cum_len_next % mss;
1916 				next_is_first = (cum_len_next == 0);
1917 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1918 				flags_next |= next_is_first *
1919 					MXGEFW_FLAGS_FIRST;
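				/*
				 * branchless bookkeeping: -(chop | next_is_first)
				 * is ~0 when either is set, forcing rdma_count
				 * to -1; adding (chop & !next_is_first) leaves
				 * it at 0 for a mid-descriptor chop, so the
				 * increment below counts this descriptor toward
				 * the new TSO segment
				 */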
1920 				rdma_count |= -(chop | next_is_first);
1921 				rdma_count += chop & !next_is_first;
1922 			} else if (cum_len_next >= 0) {
1923 				/* header ends */
1924 				rdma_count = -1;
1925 				cum_len_next = 0;
1926 				seglen = -cum_len;
1927 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1928 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1929 					MXGEFW_FLAGS_FIRST |
1930 					(small * MXGEFW_FLAGS_SMALL);
1931 			}
1932 
1933 			req->addr_high = high_swapped;
1934 			req->addr_low = htobe32(low);
1935 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1936 			req->pad = 0;
1937 			req->rdma_count = 1;
1938 			req->length = htobe16(seglen);
1939 			req->cksum_offset = cksum_offset;
1940 			req->flags = flags | ((cum_len & 1) *
1941 					      MXGEFW_FLAGS_ALIGN_ODD);
1942 			low += seglen;
1943 			len -= seglen;
1944 			cum_len = cum_len_next;
1945 			flags = flags_next;
1946 			req++;
1947 			cnt++;
1948 			rdma_count++;
1949 			if (cksum_offset != 0 && !pi->ip6) {
1950 				if (__predict_false(cksum_offset > seglen))
1951 					cksum_offset -= seglen;
1952 				else
1953 					cksum_offset = 0;
1954 			}
1955 			if (__predict_false(cnt > tx->max_desc))
1956 				goto drop;
1957 		}
1958 		busdma_seg_cnt--;
1959 		seg++;
1960 	}
1961 	(req-rdma_count)->rdma_count = rdma_count;
1962 
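	/*
	 * walk backwards, marking descriptors with TSO_LAST until we
	 * reach the start of the final segment (a descriptor already
	 * flagged TSO_CHOP or FIRST)
	 */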
1963 	do {
1964 		req--;
1965 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1966 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1967 
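	/*
	 * flag the slot holding the packet's final descriptor so that
	 * mxge_tx_done() can count one completed packet when the
	 * firmware's send_done_count passes it
	 */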
1968 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1969 	mxge_submit_req(tx, tx->req_list, cnt);
1970 #ifdef IFNET_BUF_RING
1971 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1972 		/* tell the NIC to start polling this slice */
1973 		*tx->send_go = 1;
1974 		tx->queue_active = 1;
1975 		tx->activate++;
1976 		wmb();
1977 	}
1978 #endif
1979 	return;
1980 
1981 drop:
1982 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1983 	m_freem(m);
1984 	ss->oerrors++;
1985 	if (!once) {
1986 		printf("tx->max_desc exceeded via TSO!\n");
1987 		printf("mss = %d, %ld, %d!\n", mss,
1988 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1989 		once = 1;
1990 	}
1991 	return;
1992 
1993 }
1994 
1995 #endif /* IFCAP_TSO4 */
1996 
1997 #ifdef MXGE_NEW_VLAN_API
1998 /*
1999  * We reproduce the software vlan tag insertion from
2000  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2001  * vlan tag insertion. We need to advertise this in order to have the
2002  * vlan interface respect our csum offload flags.
2003  */
2004 static struct mbuf *
2005 mxge_vlan_tag_insert(struct mbuf *m)
2006 {
2007 	struct ether_vlan_header *evl;
2008 
2009 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2010 	if (__predict_false(m == NULL))
2011 		return NULL;
2012 	if (m->m_len < sizeof(*evl)) {
2013 		m = m_pullup(m, sizeof(*evl));
2014 		if (__predict_false(m == NULL))
2015 			return NULL;
2016 	}
2017 	/*
2018 	 * Transform the Ethernet header into an Ethernet header
2019 	 * with 802.1Q encapsulation.
2020 	 */
2021 	evl = mtod(m, struct ether_vlan_header *);
2022 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2023 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2024 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2025 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2026 	m->m_flags &= ~M_VLANTAG;
2027 	return m;
2028 }
2029 #endif /* MXGE_NEW_VLAN_API */
2030 
2031 static void
2032 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2033 {
2034 	struct mxge_pkt_info pi = {0,0,0,0};
2035 	mxge_softc_t *sc;
2036 	mcp_kreq_ether_send_t *req;
2037 	bus_dma_segment_t *seg;
2038 	struct mbuf *m_tmp;
2039 	struct ifnet *ifp;
2040 	mxge_tx_ring_t *tx;
2041 	int cnt, cum_len, err, i, idx, odd_flag;
2042 	uint16_t pseudo_hdr_offset;
2043 	uint8_t flags, cksum_offset;
2044 
2045 	sc = ss->sc;
2046 	ifp = sc->ifp;
2047 	tx = &ss->tx;
2048 
2049 #ifdef MXGE_NEW_VLAN_API
2050 	if (m->m_flags & M_VLANTAG) {
2051 		m = mxge_vlan_tag_insert(m);
2052 		if (__predict_false(m == NULL))
2053 			goto drop_without_m;
2054 	}
2055 #endif
2056 	if (m->m_pkthdr.csum_flags &
2057 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2058 		if (mxge_parse_tx(ss, m, &pi))
2059 			goto drop;
2060 	}
2061 
2062 	/* (try to) map the frame for DMA */
2063 	idx = tx->req & tx->mask;
2064 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2065 				      m, tx->seg_list, &cnt,
2066 				      BUS_DMA_NOWAIT);
2067 	if (__predict_false(err == EFBIG)) {
2068 		/* Too many segments in the chain.  Try
2069 		   to defrag */
2070 		m_tmp = m_defrag(m, M_NOWAIT);
2071 		if (m_tmp == NULL) {
2072 			goto drop;
2073 		}
2074 		ss->tx.defrag++;
2075 		m = m_tmp;
2076 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2077 					      tx->info[idx].map,
2078 					      m, tx->seg_list, &cnt,
2079 					      BUS_DMA_NOWAIT);
2080 	}
2081 	if (__predict_false(err != 0)) {
2082 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2083 			      " packet len = %d\n", err, m->m_pkthdr.len);
2084 		goto drop;
2085 	}
2086 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2087 			BUS_DMASYNC_PREWRITE);
2088 	tx->info[idx].m = m;
2089 
2090 #if IFCAP_TSO4
2091 	/* TSO is different enough, we handle it in another routine */
2092 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2093 		mxge_encap_tso(ss, m, cnt, &pi);
2094 		return;
2095 	}
2096 #endif
2097 
2098 	req = tx->req_list;
2099 	cksum_offset = 0;
2100 	pseudo_hdr_offset = 0;
2101 	flags = MXGEFW_FLAGS_NO_TSO;
2102 
2103 	/* checksum offloading? */
2104 	if (m->m_pkthdr.csum_flags &
2105 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2106 		/* tell the NIC where the transport header starts and
2107 		   where to store the computed checksum */
2108 		cksum_offset = pi.ip_off + pi.ip_hlen;
2109 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2110 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2111 		req->cksum_offset = cksum_offset;
2112 		flags |= MXGEFW_FLAGS_CKSUM;
2113 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2114 	} else {
2115 		odd_flag = 0;
2116 	}
2117 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2118 		flags |= MXGEFW_FLAGS_SMALL;
2119 
2120 	/* convert segments into a request list */
2121 	cum_len = 0;
2122 	seg = tx->seg_list;
2123 	req->flags = MXGEFW_FLAGS_FIRST;
2124 	for (i = 0; i < cnt; i++) {
2125 		req->addr_low =
2126 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2127 		req->addr_high =
2128 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2129 		req->length = htobe16(seg->ds_len);
2130 		req->cksum_offset = cksum_offset;
2131 		if (cksum_offset > seg->ds_len)
2132 			cksum_offset -= seg->ds_len;
2133 		else
2134 			cksum_offset = 0;
2135 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2136 		req->pad = 0; /* complete solid 16-byte block */
2137 		req->rdma_count = 1;
2138 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2139 		cum_len += seg->ds_len;
2140 		seg++;
2141 		req++;
2142 		req->flags = 0;
2143 	}
2144 	req--;
2145 	/* pad runts to 60 bytes (ETHER_MIN_LEN less the 4-byte CRC) */
2146 	if (cum_len < 60) {
2147 		req++;
2148 		req->addr_low =
2149 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2150 		req->addr_high =
2151 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2152 		req->length = htobe16(60 - cum_len);
2153 		req->cksum_offset = 0;
2154 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2155 		req->pad = 0; /* complete solid 16-byte block */
2156 		req->rdma_count = 1;
2157 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2158 		cnt++;
2159 	}
2160 
2161 	tx->req_list[0].rdma_count = cnt;
2162 #if 0
2163 	/* print what the firmware will see */
2164 	for (i = 0; i < cnt; i++) {
2165 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2166 		    "cso:%d, flags:0x%x, rdma:%d\n",
2167 		    i, (int)ntohl(tx->req_list[i].addr_high),
2168 		    (int)ntohl(tx->req_list[i].addr_low),
2169 		    (int)ntohs(tx->req_list[i].length),
2170 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2171 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2172 		    tx->req_list[i].rdma_count);
2173 	}
2174 	printf("--------------\n");
2175 #endif
2176 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2177 	mxge_submit_req(tx, tx->req_list, cnt);
2178 #ifdef IFNET_BUF_RING
2179 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2180 		/* tell the NIC to start polling this slice */
2181 		*tx->send_go = 1;
2182 		tx->queue_active = 1;
2183 		tx->activate++;
2184 		wmb();
2185 	}
2186 #endif
2187 	return;
2188 
2189 drop:
2190 	m_freem(m);
2191 drop_without_m:
2192 	ss->oerrors++;
2193 	return;
2194 }
2195 
2196 #ifdef IFNET_BUF_RING
2197 static void
2198 mxge_qflush(struct ifnet *ifp)
2199 {
2200 	mxge_softc_t *sc = ifp->if_softc;
2201 	mxge_tx_ring_t *tx;
2202 	struct mbuf *m;
2203 	int slice;
2204 
2205 	for (slice = 0; slice < sc->num_slices; slice++) {
2206 		tx = &sc->ss[slice].tx;
2207 		mtx_lock(&tx->mtx);
2208 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2209 			m_freem(m);
2210 		mtx_unlock(&tx->mtx);
2211 	}
2212 	if_qflush(ifp);
2213 }
2214 
2215 static inline void
2216 mxge_start_locked(struct mxge_slice_state *ss)
2217 {
2218 	mxge_softc_t *sc;
2219 	struct mbuf *m;
2220 	struct ifnet *ifp;
2221 	mxge_tx_ring_t *tx;
2222 
2223 	sc = ss->sc;
2224 	ifp = sc->ifp;
2225 	tx = &ss->tx;
2226 
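	/*
	 * tx->req - tx->done is the number of descriptors in flight;
	 * keep dequeuing only while enough slots remain free for a
	 * worst-case (max_desc) packet
	 */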
2227 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2228 		m = drbr_dequeue(ifp, tx->br);
2229 		if (m == NULL) {
2230 			return;
2231 		}
2232 		/* let BPF see it */
2233 		BPF_MTAP(ifp, m);
2234 
2235 		/* give it to the nic */
2236 		mxge_encap(ss, m);
2237 	}
2238 	/* ran out of transmit slots */
2239 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2240 	    && (!drbr_empty(ifp, tx->br))) {
2241 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2242 		tx->stall++;
2243 	}
2244 }
2245 
2246 static int
2247 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2248 {
2249 	mxge_softc_t *sc;
2250 	struct ifnet *ifp;
2251 	mxge_tx_ring_t *tx;
2252 	int err;
2253 
2254 	sc = ss->sc;
2255 	ifp = sc->ifp;
2256 	tx = &ss->tx;
2257 
2258 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2259 	    IFF_DRV_RUNNING) {
2260 		err = drbr_enqueue(ifp, tx->br, m);
2261 		return (err);
2262 	}
2263 
2264 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2265 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2266 		/* let BPF see it */
2267 		BPF_MTAP(ifp, m);
2268 		/* give it to the nic */
2269 		mxge_encap(ss, m);
2270 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2271 		return (err);
2272 	}
2273 	if (!drbr_empty(ifp, tx->br))
2274 		mxge_start_locked(ss);
2275 	return (0);
2276 }
2277 
2278 static int
2279 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2280 {
2281 	mxge_softc_t *sc = ifp->if_softc;
2282 	struct mxge_slice_state *ss;
2283 	mxge_tx_ring_t *tx;
2284 	int err = 0;
2285 	int slice;
2286 
2287 	slice = m->m_pkthdr.flowid;
2288 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2289 
2290 	ss = &sc->ss[slice];
2291 	tx = &ss->tx;
2292 
2293 	if (mtx_trylock(&tx->mtx)) {
2294 		err = mxge_transmit_locked(ss, m);
2295 		mtx_unlock(&tx->mtx);
2296 	} else {
2297 		err = drbr_enqueue(ifp, tx->br, m);
2298 	}
2299 
2300 	return (err);
2301 }
2302 
2303 #else
2304 
2305 static inline void
2306 mxge_start_locked(struct mxge_slice_state *ss)
2307 {
2308 	mxge_softc_t *sc;
2309 	struct mbuf *m;
2310 	struct ifnet *ifp;
2311 	mxge_tx_ring_t *tx;
2312 
2313 	sc = ss->sc;
2314 	ifp = sc->ifp;
2315 	tx = &ss->tx;
2316 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2317 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2318 		if (m == NULL) {
2319 			return;
2320 		}
2321 		/* let BPF see it */
2322 		BPF_MTAP(ifp, m);
2323 
2324 		/* give it to the nic */
2325 		mxge_encap(ss, m);
2326 	}
2327 	/* ran out of transmit slots */
2328 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2329 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2330 		tx->stall++;
2331 	}
2332 }
2333 #endif
2334 static void
2335 mxge_start(struct ifnet *ifp)
2336 {
2337 	mxge_softc_t *sc = ifp->if_softc;
2338 	struct mxge_slice_state *ss;
2339 
2340 	/* only use the first slice for now */
2341 	ss = &sc->ss[0];
2342 	mtx_lock(&ss->tx.mtx);
2343 	mxge_start_locked(ss);
2344 	mtx_unlock(&ss->tx.mtx);
2345 }
2346 
2347 /*
2348  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2349  * at most 32 bytes at a time, so as to avoid involving the software
2350  * pio handler in the NIC.  We re-write the first segment's low
2351  * DMA address to mark it valid only after we write the entire chunk
2352  * in a burst.
2353  */
2354 static inline void
2355 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2356 		mcp_kreq_ether_recv_t *src)
2357 {
2358 	uint32_t low;
2359 
2360 	low = src->addr_low;
2361 	src->addr_low = 0xffffffff;
2362 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2363 	wmb();
2364 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2365 	wmb();
2366 	src->addr_low = low;
2367 	dst->addr_low = low;
2368 	wmb();
2369 }
2370 
2371 static int
2372 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2373 {
2374 	bus_dma_segment_t seg;
2375 	struct mbuf *m;
2376 	mxge_rx_ring_t *rx = &ss->rx_small;
2377 	int cnt, err;
2378 
2379 	m = m_gethdr(M_NOWAIT, MT_DATA);
2380 	if (m == NULL) {
2381 		rx->alloc_fail++;
2382 		err = ENOBUFS;
2383 		goto done;
2384 	}
2385 	m->m_len = MHLEN;
2386 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2387 				      &seg, &cnt, BUS_DMA_NOWAIT);
2388 	if (err != 0) {
2389 		m_free(m);
2390 		goto done;
2391 	}
2392 	rx->info[idx].m = m;
2393 	rx->shadow[idx].addr_low =
2394 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2395 	rx->shadow[idx].addr_high =
2396 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2397 
2398 done:
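	/* receive buffers are posted to the NIC in bursts of 8 */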
2399 	if ((idx & 7) == 7)
2400 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2401 	return err;
2402 }
2403 
2404 static int
2405 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2406 {
2407 	bus_dma_segment_t seg[3];
2408 	struct mbuf *m;
2409 	mxge_rx_ring_t *rx = &ss->rx_big;
2410 	int cnt, err, i;
2411 
2412 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2413 	if (m == NULL) {
2414 		rx->alloc_fail++;
2415 		err = ENOBUFS;
2416 		goto done;
2417 	}
2418 	m->m_len = rx->mlen;
2419 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2420 				      seg, &cnt, BUS_DMA_NOWAIT);
2421 	if (err != 0) {
2422 		m_free(m);
2423 		goto done;
2424 	}
2425 	rx->info[idx].m = m;
2426 	rx->shadow[idx].addr_low =
2427 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2428 	rx->shadow[idx].addr_high =
2429 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2430 
2431 #if MXGE_VIRT_JUMBOS
2432 	for (i = 1; i < cnt; i++) {
2433 		rx->shadow[idx + i].addr_low =
2434 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2435 		rx->shadow[idx + i].addr_high =
2436 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2437 	}
2438 #endif
2439 
2440 done:
2441 	for (i = 0; i < rx->nbufs; i++) {
2442 		if ((idx & 7) == 7) {
2443 			mxge_submit_8rx(&rx->lanai[idx - 7],
2444 					&rx->shadow[idx - 7]);
2445 		}
2446 		idx++;
2447 	}
2448 	return err;
2449 }
2450 
2451 #ifdef INET6
2452 
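/*
 * 16-bit 1s complement sum over a buffer; len is assumed to be even,
 * and the result is returned without the final inversion
 */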
2453 static uint16_t
2454 mxge_csum_generic(uint16_t *raw, int len)
2455 {
2456 	uint32_t csum;
2457 
2458 	csum = 0;
2459 	while (len > 0) {
2460 		csum += *raw;
2461 		raw++;
2462 		len -= 2;
2463 	}
2464 	csum = (csum >> 16) + (csum & 0xffff);
2465 	csum = (csum >> 16) + (csum & 0xffff);
2466 	return (uint16_t)csum;
2467 }
2468 
2469 static inline uint16_t
2470 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2471 {
2472 	uint32_t partial;
2473 	int nxt, cksum_offset;
2474 	struct ip6_hdr *ip6 = p;
2475 	uint16_t c;
2476 
2477 	nxt = ip6->ip6_nxt;
2478 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2479 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2480 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2481 					   IPPROTO_IPV6, &nxt);
2482 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2483 			return (1);
2484 	}
2485 
2486 	/*
2487 	 * IPv6 headers do not contain a checksum, and hence
2488 	 * do not checksum to zero, so they don't "fall out"
2489 	 * of the partial checksum calculation like IPv4
2490 	 * headers do.  We need to fix the partial checksum by
2491 	 * subtracting the checksum of the IPv6 header.
2492 	 */
2493 
2494 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2495 				    ETHER_HDR_LEN);
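	/*
	 * 1s complement subtraction: add the complement of the header
	 * sum and fold in the end-around carry
	 */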
2496 	csum += ~partial;
2497 	csum += (csum < ~partial);
2498 	csum = (csum >> 16) + (csum & 0xFFFF);
2499 	csum = (csum >> 16) + (csum & 0xFFFF);
2500 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2501 			     csum);
2502 	c ^= 0xffff;
2503 	return (c);
2504 }
2505 #endif /* INET6 */
2506 /*
2507  *  Myri10GE hardware checksums are not valid if the sender
2508  *  padded the frame with non-zero padding.  This is because
2509  *  the firmware just does a simple 16-bit 1s complement
2510  *  checksum across the entire frame, excluding the first 14
2511  *  bytes.  It is best to simply check the checksum and
2512  *  tell the stack about it only if the checksum is good
2513  */
2514 
2515 static inline uint16_t
2516 mxge_rx_csum(struct mbuf *m, int csum)
2517 {
2518 	struct ether_header *eh;
2519 #ifdef INET
2520 	struct ip *ip;
2521 #endif
2522 #if defined(INET) || defined(INET6)
2523 	int cap = m->m_pkthdr.rcvif->if_capenable;
2524 #endif
2525 	uint16_t c, etype;
2526 
2527 	eh = mtod(m, struct ether_header *);
2528 	etype = ntohs(eh->ether_type);
2529 	switch (etype) {
2530 #ifdef INET
2531 	case ETHERTYPE_IP:
2532 		if ((cap & IFCAP_RXCSUM) == 0)
2533 			return (1);
2534 		ip = (struct ip *)(eh + 1);
2535 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2536 			return (1);
2537 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2538 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2539 				    (ip->ip_hl << 2) + ip->ip_p));
2540 		c ^= 0xffff;
2541 		break;
2542 #endif
2543 #ifdef INET6
2544 	case ETHERTYPE_IPV6:
2545 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2546 			return (1);
2547 		c = mxge_rx_csum6((eh + 1), m, csum);
2548 		break;
2549 #endif
2550 	default:
2551 		c = 1;
2552 	}
2553 	return (c);
2554 }
2555 
2556 static void
2557 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2558 {
2559 	struct ether_vlan_header *evl;
2560 	struct ether_header *eh;
2561 	uint32_t partial;
2562 
2563 	evl = mtod(m, struct ether_vlan_header *);
2564 	eh = mtod(m, struct ether_header *);
2565 
2566 	/*
2567 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2568 	 * after what the firmware thought was the end of the ethernet
2569 	 * header.
2570 	 */
2571 
2572 	/* put checksum into host byte order */
2573 	*csum = ntohs(*csum);
2574 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2575 	(*csum) += ~partial;
2576 	(*csum) += ((*csum) < ~partial);
2577 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2578 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2579 
2580 	/* restore checksum to network byte order;
2581 	   later consumers expect this */
2582 	*csum = htons(*csum);
2583 
2584 	/* save the tag */
2585 #ifdef MXGE_NEW_VLAN_API
2586 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2587 #else
2588 	{
2589 		struct m_tag *mtag;
2590 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2591 				   M_NOWAIT);
2592 		if (mtag == NULL)
2593 			return;
2594 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2595 		m_tag_prepend(m, mtag);
2596 	}
2597 
2598 #endif
2599 	m->m_flags |= M_VLANTAG;
2600 
2601 	/*
2602 	 * Remove the 802.1q header by copying the Ethernet
2603 	 * addresses over it and adjusting the beginning of
2604 	 * the data in the mbuf.  The encapsulated Ethernet
2605 	 * type field is already in place.
2606 	 */
2607 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2608 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2609 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2610 }
2611 
2612 static inline void
2613 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2614 		 uint32_t csum, int lro)
2615 {
2616 	mxge_softc_t *sc;
2617 	struct ifnet *ifp;
2618 	struct mbuf *m;
2619 	struct ether_header *eh;
2620 	mxge_rx_ring_t *rx;
2621 	bus_dmamap_t old_map;
2622 	int idx;
2623 
2624 	sc = ss->sc;
2625 	ifp = sc->ifp;
2626 	rx = &ss->rx_big;
2627 	idx = rx->cnt & rx->mask;
2628 	rx->cnt += rx->nbufs;
2629 	/* save a pointer to the received mbuf */
2630 	m = rx->info[idx].m;
2631 	/* try to replace the received mbuf */
2632 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2633 		/* drop the frame -- the old mbuf is re-cycled */
2634 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2635 		return;
2636 	}
2637 
2638 	/* unmap the received buffer */
2639 	old_map = rx->info[idx].map;
2640 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2641 	bus_dmamap_unload(rx->dmat, old_map);
2642 
2643 	/* swap the bus_dmamap_t's */
2644 	rx->info[idx].map = rx->extra_map;
2645 	rx->extra_map = old_map;
2646 
2647 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2648 	 * aligned */
2649 	m->m_data += MXGEFW_PAD;
2650 
2651 	m->m_pkthdr.rcvif = ifp;
2652 	m->m_len = m->m_pkthdr.len = len;
2653 	ss->ipackets++;
2654 	eh = mtod(m, struct ether_header *);
2655 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2656 		mxge_vlan_tag_remove(m, &csum);
2657 	}
2658 	/* flowid only valid if RSS hashing is enabled */
2659 	if (sc->num_slices > 1) {
2660 		m->m_pkthdr.flowid = (ss - sc->ss);
2661 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2662 	}
2663 	/* if the checksum is valid, mark it in the mbuf header */
2664 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2665 	    (0 == mxge_rx_csum(m, csum))) {
2666 		/* Tell the stack that the checksum is good */
2667 		m->m_pkthdr.csum_data = 0xffff;
2668 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2669 			CSUM_DATA_VALID;
2670 
2671 #if defined(INET) || defined (INET6)
2672 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2673 			return;
2674 #endif
2675 	}
2676 	/* pass the frame up the stack */
2677 	(*ifp->if_input)(ifp, m);
2678 }
2679 
2680 static inline void
2681 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2682 		   uint32_t csum, int lro)
2683 {
2684 	mxge_softc_t *sc;
2685 	struct ifnet *ifp;
2686 	struct ether_header *eh;
2687 	struct mbuf *m;
2688 	mxge_rx_ring_t *rx;
2689 	bus_dmamap_t old_map;
2690 	int idx;
2691 
2692 	sc = ss->sc;
2693 	ifp = sc->ifp;
2694 	rx = &ss->rx_small;
2695 	idx = rx->cnt & rx->mask;
2696 	rx->cnt++;
2697 	/* save a pointer to the received mbuf */
2698 	m = rx->info[idx].m;
2699 	/* try to replace the received mbuf */
2700 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2701 		/* drop the frame -- the old mbuf is re-cycled */
2702 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2703 		return;
2704 	}
2705 
2706 	/* unmap the received buffer */
2707 	old_map = rx->info[idx].map;
2708 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2709 	bus_dmamap_unload(rx->dmat, old_map);
2710 
2711 	/* swap the bus_dmamap_t's */
2712 	rx->info[idx].map = rx->extra_map;
2713 	rx->extra_map = old_map;
2714 
2715 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2716 	 * aligned */
2717 	m->m_data += MXGEFW_PAD;
2718 
2719 	m->m_pkthdr.rcvif = ifp;
2720 	m->m_len = m->m_pkthdr.len = len;
2721 	ss->ipackets++;
2722 	eh = mtod(m, struct ether_header *);
2723 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2724 		mxge_vlan_tag_remove(m, &csum);
2725 	}
2726 	/* flowid only valid if RSS hashing is enabled */
2727 	if (sc->num_slices > 1) {
2728 		m->m_pkthdr.flowid = (ss - sc->ss);
2729 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2730 	}
2731 	/* if the checksum is valid, mark it in the mbuf header */
2732 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2733 	    (0 == mxge_rx_csum(m, csum))) {
2734 		/* Tell the stack that the checksum is good */
2735 		m->m_pkthdr.csum_data = 0xffff;
2736 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2737 			CSUM_DATA_VALID;
2738 
2739 #if defined(INET) || defined (INET6)
2740 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2741 			return;
2742 #endif
2743 	}
2744 	/* pass the frame up the stack */
2745 	(*ifp->if_input)(ifp, m);
2746 }
2747 
2748 static inline void
2749 mxge_clean_rx_done(struct mxge_slice_state *ss)
2750 {
2751 	mxge_rx_done_t *rx_done = &ss->rx_done;
2752 	int limit = 0;
2753 	uint16_t length;
2754 	uint16_t checksum;
2755 	int lro;
2756 
2757 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2758 	while (rx_done->entry[rx_done->idx].length != 0) {
2759 		length = ntohs(rx_done->entry[rx_done->idx].length);
2760 		rx_done->entry[rx_done->idx].length = 0;
2761 		checksum = rx_done->entry[rx_done->idx].checksum;
2762 		if (length <= (MHLEN - MXGEFW_PAD))
2763 			mxge_rx_done_small(ss, length, checksum, lro);
2764 		else
2765 			mxge_rx_done_big(ss, length, checksum, lro);
2766 		rx_done->cnt++;
2767 		rx_done->idx = rx_done->cnt & rx_done->mask;
2768 
2769 		/* limit potential for livelock */
2770 		if (__predict_false(++limit > rx_done->mask / 2))
2771 			break;
2772 	}
2773 #if defined(INET)  || defined (INET6)
2774 	tcp_lro_flush_all(&ss->lc);
2775 #endif
2776 }
2777 
2778 static inline void
2779 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2780 {
2781 	struct ifnet *ifp;
2782 	mxge_tx_ring_t *tx;
2783 	struct mbuf *m;
2784 	bus_dmamap_t map;
2785 	int idx;
2786 	int *flags;
2787 
2788 	tx = &ss->tx;
2789 	ifp = ss->sc->ifp;
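	/*
	 * tx->done counts reclaimed descriptors, while tx->pkt_done
	 * counts whole packets; a slot's flag (set at submit time)
	 * marks the final descriptor of each packet
	 */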
2790 	while (tx->pkt_done != mcp_idx) {
2791 		idx = tx->done & tx->mask;
2792 		tx->done++;
2793 		m = tx->info[idx].m;
2794 		/* mbuf and DMA map only attached to the first
2795 		   segment per-mbuf */
2796 		if (m != NULL) {
2797 			ss->obytes += m->m_pkthdr.len;
2798 			if (m->m_flags & M_MCAST)
2799 				ss->omcasts++;
2800 			ss->opackets++;
2801 			tx->info[idx].m = NULL;
2802 			map = tx->info[idx].map;
2803 			bus_dmamap_unload(tx->dmat, map);
2804 			m_freem(m);
2805 		}
2806 		if (tx->info[idx].flag) {
2807 			tx->info[idx].flag = 0;
2808 			tx->pkt_done++;
2809 		}
2810 	}
2811 
2812 	/* If the ring has drained to at most 1/4 full, clear
2813 	   IFF_DRV_OACTIVE to tell the stack that it's OK to send packets */
2814 #ifdef IFNET_BUF_RING
2815 	flags = &ss->if_drv_flags;
2816 #else
2817 	flags = &ifp->if_drv_flags;
2818 #endif
2819 	mtx_lock(&ss->tx.mtx);
2820 	if ((*flags) & IFF_DRV_OACTIVE &&
2821 	    tx->req - tx->done < (tx->mask + 1)/4) {
2822 		*(flags) &= ~IFF_DRV_OACTIVE;
2823 		ss->tx.wake++;
2824 		mxge_start_locked(ss);
2825 	}
2826 #ifdef IFNET_BUF_RING
2827 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2828 		/* let the NIC stop polling this queue, since there
2829 		 * are no more transmits pending */
2830 		*tx->send_stop = 1;
2831 		tx->queue_active = 0;
2832 		tx->deactivate++;
2833 		wmb();
2834 	}
2837 #endif
2838 	mtx_unlock(&ss->tx.mtx);
2839 
2840 }
2841 
2842 static struct mxge_media_type mxge_xfp_media_types[] =
2843 {
2844 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2845 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2846 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2847 	{0,		(1 << 5),	"10GBASE-ER"},
2848 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2849 	{0,		(1 << 3),	"10GBASE-SW"},
2850 	{0,		(1 << 2),	"10GBASE-LW"},
2851 	{0,		(1 << 1),	"10GBASE-EW"},
2852 	{0,		(1 << 0),	"Reserved"}
2853 };
2854 static struct mxge_media_type mxge_sfp_media_types[] =
2855 {
2856 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2857 	{0,		(1 << 7),	"Reserved"},
2858 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2859 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2860 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2861 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2862 };
2863 
2864 static void
2865 mxge_media_set(mxge_softc_t *sc, int media_type)
2866 {
2867 
2868 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2869 		    0, NULL);
2870 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2871 	sc->current_media = media_type;
2872 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2873 }
2874 
2875 static void
2876 mxge_media_init(mxge_softc_t *sc)
2877 {
2878 	char *ptr;
2879 	int i;
2880 
2881 	ifmedia_removeall(&sc->media);
2882 	mxge_media_set(sc, IFM_AUTO);
2883 
2884 	/*
2885 	 * parse the product code to determine the interface type
2886 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2887 	 * after the 3rd dash in the driver's cached copy of the
2888 	 * EEPROM's product code string.
2889 	 */
2890 	ptr = sc->product_code_string;
2891 	if (ptr == NULL) {
2892 		device_printf(sc->dev, "Missing product code\n");
2893 		return;
2894 	}
2895 
2896 	for (i = 0; i < 3; i++, ptr++) {
2897 		ptr = strchr(ptr, '-');
2898 		if (ptr == NULL) {
2899 			device_printf(sc->dev,
2900 				      "only %d dashes in PC?!?\n", i);
2901 			return;
2902 		}
2903 	}
2904 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2905 		/* -C is CX4 */
2906 		sc->connector = MXGE_CX4;
2907 		mxge_media_set(sc, IFM_10G_CX4);
2908 	} else if (*ptr == 'Q') {
2909 		/* -Q is Quad Ribbon Fiber */
2910 		sc->connector = MXGE_QRF;
2911 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2912 		/* FreeBSD has no media type for Quad ribbon fiber */
2913 	} else if (*ptr == 'R') {
2914 		/* -R is XFP */
2915 		sc->connector = MXGE_XFP;
2916 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2917 		/* -S or -2S is SFP+ */
2918 		sc->connector = MXGE_SFP;
2919 	} else {
2920 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2921 	}
2922 }
2923 
2924 /*
2925  * Determine the media type for a NIC.  Some XFPs will identify
2926  * themselves only when their link is up, so this is initiated via a
2927  * link up interrupt.  However, this can potentially take up to
2928  * several milliseconds, so it is run via the watchdog routine, rather
2929  * than in the interrupt handler itself.
2930  */
2931 static void
2932 mxge_media_probe(mxge_softc_t *sc)
2933 {
2934 	mxge_cmd_t cmd;
2935 	char *cage_type;
2936 
2937 	struct mxge_media_type *mxge_media_types = NULL;
2938 	int i, err, ms, mxge_media_type_entries;
2939 	uint32_t byte;
2940 
2941 	sc->need_media_probe = 0;
2942 
2943 	if (sc->connector == MXGE_XFP) {
2944 		/* -R is XFP */
2945 		mxge_media_types = mxge_xfp_media_types;
2946 		mxge_media_type_entries =
2947 			nitems(mxge_xfp_media_types);
2948 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2949 		cage_type = "XFP";
2950 	} else if (sc->connector == MXGE_SFP) {
2951 		/* -S or -2S is SFP+ */
2952 		mxge_media_types = mxge_sfp_media_types;
2953 		mxge_media_type_entries =
2954 			nitems(mxge_sfp_media_types);
2955 		cage_type = "SFP+";
2956 		byte = 3;
2957 	} else {
2958 		/* nothing to do; media type cannot change */
2959 		return;
2960 	}
2961 
2962 	/*
2963 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2964 	 * now we try to determine what is in the cage by using the
2965 	 * firmware's I2C commands to read the 10GbE compliance
2966 	 * register.  We read just one byte, which may take over
2967 	 * a millisecond.
2968 	 */
2969 
2970 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2971 	cmd.data1 = byte;
2972 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2973 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2974 		device_printf(sc->dev, "failed to read XFP\n");
2975 	}
2976 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2977 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2978 	}
2979 	if (err != MXGEFW_CMD_OK) {
2980 		return;
2981 	}
2982 
2983 	/* now we wait for the data to be cached */
2984 	cmd.data0 = byte;
2985 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2986 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2987 		DELAY(1000);
2988 		cmd.data0 = byte;
2989 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2990 	}
2991 	if (err != MXGEFW_CMD_OK) {
2992 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2993 			      cage_type, err, ms);
2994 		return;
2995 	}
2996 
2997 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2998 		if (mxge_verbose)
2999 			device_printf(sc->dev, "%s:%s\n", cage_type,
3000 				      mxge_media_types[0].name);
3001 		if (sc->current_media != mxge_media_types[0].flag) {
3002 			mxge_media_init(sc);
3003 			mxge_media_set(sc, mxge_media_types[0].flag);
3004 		}
3005 		return;
3006 	}
3007 	for (i = 1; i < mxge_media_type_entries; i++) {
3008 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3009 			if (mxge_verbose)
3010 				device_printf(sc->dev, "%s:%s\n",
3011 					      cage_type,
3012 					      mxge_media_types[i].name);
3013 
3014 			if (sc->current_media != mxge_media_types[i].flag) {
3015 				mxge_media_init(sc);
3016 				mxge_media_set(sc, mxge_media_types[i].flag);
3017 			}
3018 			return;
3019 		}
3020 	}
3021 	if (mxge_verbose)
3022 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3023 			      cage_type, cmd.data0);
3024 
3025 	return;
3026 }
3027 
3028 static void
3029 mxge_intr(void *arg)
3030 {
3031 	struct mxge_slice_state *ss = arg;
3032 	mxge_softc_t *sc = ss->sc;
3033 	mcp_irq_data_t *stats = ss->fw_stats;
3034 	mxge_tx_ring_t *tx = &ss->tx;
3035 	mxge_rx_done_t *rx_done = &ss->rx_done;
3036 	uint32_t send_done_count;
3037 	uint8_t valid;
3038 
3039 #ifndef IFNET_BUF_RING
3040 	/* an interrupt on a non-zero slice is implicitly valid
3041 	   since MSI-X irqs are not shared */
3042 	if (ss != sc->ss) {
3043 		mxge_clean_rx_done(ss);
3044 		*ss->irq_claim = be32toh(3);
3045 		return;
3046 	}
3047 #endif
3048 
3049 	/* make sure the DMA has finished */
3050 	if (!stats->valid) {
3051 		return;
3052 	}
3053 	valid = stats->valid;
3054 
3055 	if (sc->legacy_irq) {
3056 		/* lower legacy IRQ  */
3057 		*sc->irq_deassert = 0;
3058 		if (!mxge_deassert_wait)
3059 			/* don't wait for confirmation that the irq is low */
3060 			stats->valid = 0;
3061 	} else {
3062 		stats->valid = 0;
3063 	}
3064 
3065 	/* loop while waiting for legacy irq deassertion */
3066 	do {
3067 		/* check for transmit completes and receives */
3068 		send_done_count = be32toh(stats->send_done_count);
3069 		while ((send_done_count != tx->pkt_done) ||
3070 		       (rx_done->entry[rx_done->idx].length != 0)) {
3071 			if (send_done_count != tx->pkt_done)
3072 				mxge_tx_done(ss, (int)send_done_count);
3073 			mxge_clean_rx_done(ss);
3074 			send_done_count = be32toh(stats->send_done_count);
3075 		}
3076 		if (sc->legacy_irq && mxge_deassert_wait)
3077 			wmb();
3078 	} while (*((volatile uint8_t *) &stats->valid));
3079 
3080 	/* fw link & error stats meaningful only on the first slice */
3081 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3082 		if (sc->link_state != stats->link_up) {
3083 			sc->link_state = stats->link_up;
3084 			if (sc->link_state) {
3085 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3086 				if (mxge_verbose)
3087 					device_printf(sc->dev, "link up\n");
3088 			} else {
3089 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3090 				if (mxge_verbose)
3091 					device_printf(sc->dev, "link down\n");
3092 			}
3093 			sc->need_media_probe = 1;
3094 		}
3095 		if (sc->rdma_tags_available !=
3096 		    be32toh(stats->rdma_tags_available)) {
3097 			sc->rdma_tags_available =
3098 				be32toh(stats->rdma_tags_available);
3099 			device_printf(sc->dev, "RDMA timed out! %d tags "
3100 				      "left\n", sc->rdma_tags_available);
3101 		}
3102 
3103 		if (stats->link_down) {
3104 			sc->down_cnt += stats->link_down;
3105 			sc->link_state = 0;
3106 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3107 		}
3108 	}
3109 
3110 	/* check to see if we have rx token to pass back */
3111 	if (valid & 0x1)
3112 		*ss->irq_claim = be32toh(3);
3113 	*(ss->irq_claim + 1) = be32toh(3);
3114 }
3115 
3116 static void
3117 mxge_init(void *arg)
3118 {
3119 	mxge_softc_t *sc = arg;
3120 	struct ifnet *ifp = sc->ifp;
3121 
3122 	mtx_lock(&sc->driver_mtx);
3123 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3124 		(void) mxge_open(sc);
3125 	mtx_unlock(&sc->driver_mtx);
3126 }
3127 
3128 static void
3129 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3130 {
3131 	int i;
3132 
3133 #if defined(INET) || defined(INET6)
3134 	tcp_lro_free(&ss->lc);
3135 #endif
3136 	for (i = 0; i <= ss->rx_big.mask; i++) {
3137 		if (ss->rx_big.info[i].m == NULL)
3138 			continue;
3139 		bus_dmamap_unload(ss->rx_big.dmat,
3140 				  ss->rx_big.info[i].map);
3141 		m_freem(ss->rx_big.info[i].m);
3142 		ss->rx_big.info[i].m = NULL;
3143 	}
3144 
3145 	for (i = 0; i <= ss->rx_small.mask; i++) {
3146 		if (ss->rx_small.info[i].m == NULL)
3147 			continue;
3148 		bus_dmamap_unload(ss->rx_small.dmat,
3149 				  ss->rx_small.info[i].map);
3150 		m_freem(ss->rx_small.info[i].m);
3151 		ss->rx_small.info[i].m = NULL;
3152 	}
3153 
3154 	/* transmit ring used only on the first slice */
3155 	if (ss->tx.info == NULL)
3156 		return;
3157 
3158 	for (i = 0; i <= ss->tx.mask; i++) {
3159 		ss->tx.info[i].flag = 0;
3160 		if (ss->tx.info[i].m == NULL)
3161 			continue;
3162 		bus_dmamap_unload(ss->tx.dmat,
3163 				  ss->tx.info[i].map);
3164 		m_freem(ss->tx.info[i].m);
3165 		ss->tx.info[i].m = NULL;
3166 	}
3167 }
3168 
3169 static void
3170 mxge_free_mbufs(mxge_softc_t *sc)
3171 {
3172 	int slice;
3173 
3174 	for (slice = 0; slice < sc->num_slices; slice++)
3175 		mxge_free_slice_mbufs(&sc->ss[slice]);
3176 }
3177 
3178 static void
3179 mxge_free_slice_rings(struct mxge_slice_state *ss)
3180 {
3181 	int i;
3182 
3183 	if (ss->rx_done.entry != NULL)
3184 		mxge_dma_free(&ss->rx_done.dma);
3185 	ss->rx_done.entry = NULL;
3186 
3187 	if (ss->tx.req_bytes != NULL)
3188 		free(ss->tx.req_bytes, M_DEVBUF);
3189 	ss->tx.req_bytes = NULL;
3190 
3191 	if (ss->tx.seg_list != NULL)
3192 		free(ss->tx.seg_list, M_DEVBUF);
3193 	ss->tx.seg_list = NULL;
3194 
3195 	if (ss->rx_small.shadow != NULL)
3196 		free(ss->rx_small.shadow, M_DEVBUF);
3197 	ss->rx_small.shadow = NULL;
3198 
3199 	if (ss->rx_big.shadow != NULL)
3200 		free(ss->rx_big.shadow, M_DEVBUF);
3201 	ss->rx_big.shadow = NULL;
3202 
3203 	if (ss->tx.info != NULL) {
3204 		if (ss->tx.dmat != NULL) {
3205 			for (i = 0; i <= ss->tx.mask; i++) {
3206 				bus_dmamap_destroy(ss->tx.dmat,
3207 						   ss->tx.info[i].map);
3208 			}
3209 			bus_dma_tag_destroy(ss->tx.dmat);
3210 		}
3211 		free(ss->tx.info, M_DEVBUF);
3212 	}
3213 	ss->tx.info = NULL;
3214 
3215 	if (ss->rx_small.info != NULL) {
3216 		if (ss->rx_small.dmat != NULL) {
3217 			for (i = 0; i <= ss->rx_small.mask; i++) {
3218 				bus_dmamap_destroy(ss->rx_small.dmat,
3219 						   ss->rx_small.info[i].map);
3220 			}
3221 			bus_dmamap_destroy(ss->rx_small.dmat,
3222 					   ss->rx_small.extra_map);
3223 			bus_dma_tag_destroy(ss->rx_small.dmat);
3224 		}
3225 		free(ss->rx_small.info, M_DEVBUF);
3226 	}
3227 	ss->rx_small.info = NULL;
3228 
3229 	if (ss->rx_big.info != NULL) {
3230 		if (ss->rx_big.dmat != NULL) {
3231 			for (i = 0; i <= ss->rx_big.mask; i++) {
3232 				bus_dmamap_destroy(ss->rx_big.dmat,
3233 						   ss->rx_big.info[i].map);
3234 			}
3235 			bus_dmamap_destroy(ss->rx_big.dmat,
3236 					   ss->rx_big.extra_map);
3237 			bus_dma_tag_destroy(ss->rx_big.dmat);
3238 		}
3239 		free(ss->rx_big.info, M_DEVBUF);
3240 	}
3241 	ss->rx_big.info = NULL;
3242 }
3243 
3244 static void
3245 mxge_free_rings(mxge_softc_t *sc)
3246 {
3247 	int slice;
3248 
3249 	for (slice = 0; slice < sc->num_slices; slice++)
3250 		mxge_free_slice_rings(&sc->ss[slice]);
3251 }
3252 
3253 static int
3254 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3255 		       int tx_ring_entries)
3256 {
3257 	mxge_softc_t *sc = ss->sc;
3258 	size_t bytes;
3259 	int err, i;
3260 
3261 	/* allocate per-slice receive resources */
3262 
3263 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3264 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3265 
3266 	/* allocate the rx shadow rings */
3267 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3268 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3269 
3270 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3271 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3272 
3273 	/* allocate the rx host info rings */
3274 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3275 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3276 
3277 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3278 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3279 
3280 	/* allocate the rx busdma resources */
3281 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3282 				 1,			/* alignment */
3283 				 4096,			/* boundary */
3284 				 BUS_SPACE_MAXADDR,	/* low */
3285 				 BUS_SPACE_MAXADDR,	/* high */
3286 				 NULL, NULL,		/* filter */
3287 				 MHLEN,			/* maxsize */
3288 				 1,			/* num segs */
3289 				 MHLEN,			/* maxsegsize */
3290 				 BUS_DMA_ALLOCNOW,	/* flags */
3291 				 NULL, NULL,		/* lock */
3292 				 &ss->rx_small.dmat);	/* tag */
3293 	if (err != 0) {
3294 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3295 			      err);
3296 		return err;
3297 	}
3298 
3299 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3300 				 1,			/* alignment */
3301 #if MXGE_VIRT_JUMBOS
3302 				 4096,			/* boundary */
3303 #else
3304 				 0,			/* boundary */
3305 #endif
3306 				 BUS_SPACE_MAXADDR,	/* low */
3307 				 BUS_SPACE_MAXADDR,	/* high */
3308 				 NULL, NULL,		/* filter */
3309 				 3*4096,		/* maxsize */
3310 #if MXGE_VIRT_JUMBOS
3311 				 3,			/* num segs */
3312 				 4096,			/* maxsegsize*/
3313 #else
3314 				 1,			/* num segs */
3315 				 MJUM9BYTES,		/* maxsegsize*/
3316 #endif
3317 				 BUS_DMA_ALLOCNOW,	/* flags */
3318 				 NULL, NULL,		/* lock */
3319 				 &ss->rx_big.dmat);	/* tag */
3320 	if (err != 0) {
3321 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3322 			      err);
3323 		return err;
3324 	}
3325 	for (i = 0; i <= ss->rx_small.mask; i++) {
3326 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3327 					&ss->rx_small.info[i].map);
3328 		if (err != 0) {
3329 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3330 				      err);
3331 			return err;
3332 		}
3333 	}
3334 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3335 				&ss->rx_small.extra_map);
3336 	if (err != 0) {
3337 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3338 			      err);
3339 		return err;
3340 	}
3341 
3342 	for (i = 0; i <= ss->rx_big.mask; i++) {
3343 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3344 					&ss->rx_big.info[i].map);
3345 		if (err != 0) {
3346 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3347 				      err);
3348 			return err;
3349 		}
3350 	}
3351 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3352 				&ss->rx_big.extra_map);
3353 	if (err != 0) {
3354 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3355 			      err);
3356 		return err;
3357 	}
3358 
3359 	/* now allocate TX resources */
3360 
3361 #ifndef IFNET_BUF_RING
3362 	/* only use a single TX ring for now */
3363 	if (ss != ss->sc->ss)
3364 		return 0;
3365 #endif
3366 
3367 	ss->tx.mask = tx_ring_entries - 1;
3368 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3369 
3370 	/* allocate the tx request copy block */
3371 	bytes = 8 +
3372 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3373 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3374 	/* ensure req_list entries are aligned to 8 bytes */
3375 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3376 		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3377 
3378 	/* allocate the tx busdma segment list */
3379 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3380 	ss->tx.seg_list = (bus_dma_segment_t *)
3381 		malloc(bytes, M_DEVBUF, M_WAITOK);
3382 
3383 	/* allocate the tx host info ring */
3384 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3385 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3386 
3387 	/* allocate the tx busdma resources */
3388 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3389 				 1,			/* alignment */
3390 				 sc->tx_boundary,	/* boundary */
3391 				 BUS_SPACE_MAXADDR,	/* low */
3392 				 BUS_SPACE_MAXADDR,	/* high */
3393 				 NULL, NULL,		/* filter */
3394 				 65536 + 256,		/* maxsize */
3395 				 ss->tx.max_desc - 2,	/* num segs */
3396 				 sc->tx_boundary,	/* maxsegsz */
3397 				 BUS_DMA_ALLOCNOW,	/* flags */
3398 				 NULL, NULL,		/* lock */
3399 				 &ss->tx.dmat);		/* tag */
3400 
3401 	if (err != 0) {
3402 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3403 			      err);
3404 		return err;
3405 	}
3406 
3407 	/* now use these tags to setup dmamaps for each slot
3408 	   in the ring */
3409 	for (i = 0; i <= ss->tx.mask; i++) {
3410 		err = bus_dmamap_create(ss->tx.dmat, 0,
3411 					&ss->tx.info[i].map);
3412 		if (err != 0) {
3413 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3414 				      err);
3415 			return err;
3416 		}
3417 	}
3418 	return 0;
3420 }
3421 
3422 static int
3423 mxge_alloc_rings(mxge_softc_t *sc)
3424 {
3425 	mxge_cmd_t cmd;
3426 	int tx_ring_size;
3427 	int tx_ring_entries, rx_ring_entries;
3428 	int err, slice;
3429 
3430 	/* get ring sizes */
3431 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3432 	tx_ring_size = cmd.data0;
3433 	if (err != 0) {
3434 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3435 		goto abort;
3436 	}
3437 
3438 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3439 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3440 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3441 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3442 	IFQ_SET_READY(&sc->ifp->if_snd);
3443 
3444 	for (slice = 0; slice < sc->num_slices; slice++) {
3445 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3446 					     rx_ring_entries,
3447 					     tx_ring_entries);
3448 		if (err != 0)
3449 			goto abort;
3450 	}
3451 	return 0;
3452 
3453 abort:
3454 	mxge_free_rings(sc);
3455 	return err;
3457 }
3458 
3459 static void
3460 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3461 {
3462 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3463 
3464 	if (bufsize < MCLBYTES) {
3465 		/* easy, everything fits in a single buffer */
3466 		*big_buf_size = MCLBYTES;
3467 		*cl_size = MCLBYTES;
3468 		*nbufs = 1;
3469 		return;
3470 	}
3471 
3472 	if (bufsize < MJUMPAGESIZE) {
3473 		/* still easy, everything still fits in a single buffer */
3474 		*big_buf_size = MJUMPAGESIZE;
3475 		*cl_size = MJUMPAGESIZE;
3476 		*nbufs = 1;
3477 		return;
3478 	}
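	/*
	 * e.g. a 9000-byte MTU no longer fits in a page-sized cluster,
	 * so fall through to a 9KB cluster (or, with MXGE_VIRT_JUMBOS,
	 * several page-sized chunks)
	 */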
3479 #if MXGE_VIRT_JUMBOS
3480 	/* now we need to use virtually contiguous buffers */
3481 	*cl_size = MJUM9BYTES;
3482 	*big_buf_size = 4096;
3483 	*nbufs = mtu / 4096 + 1;
3484 	/* needs to be a power of two, so round up */
3485 	if (*nbufs == 3)
3486 		*nbufs = 4;
3487 #else
3488 	*cl_size = MJUM9BYTES;
3489 	*big_buf_size = MJUM9BYTES;
3490 	*nbufs = 1;
3491 #endif
3492 }
3493 
3494 static int
3495 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3496 {
3497 	mxge_softc_t *sc;
3498 	mxge_cmd_t cmd;
3499 	bus_dmamap_t map;
3500 	int err, i, slice;
3501 
3502 	sc = ss->sc;
3503 	slice = ss - sc->ss;
3504 
3505 #if defined(INET) || defined(INET6)
3506 	(void)tcp_lro_init(&ss->lc);
3507 #endif
3508 	ss->lc.ifp = sc->ifp;
3509 
3510 	/* get the lanai pointers to the send and receive rings */
3511 
3512 	err = 0;
3513 #ifndef IFNET_BUF_RING
3514 	/* We currently only send from the first slice */
3515 	if (slice == 0) {
3516 #endif
3517 		cmd.data0 = slice;
3518 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3519 		ss->tx.lanai =
3520 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3521 		ss->tx.send_go = (volatile uint32_t *)
3522 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3523 		ss->tx.send_stop = (volatile uint32_t *)
3524 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3525 #ifndef IFNET_BUF_RING
3526 	}
3527 #endif
3528 	cmd.data0 = slice;
3529 	err |= mxge_send_cmd(sc,
3530 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3531 	ss->rx_small.lanai =
3532 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3533 	cmd.data0 = slice;
3534 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3535 	ss->rx_big.lanai =
3536 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3537 
3538 	if (err != 0) {
3539 		device_printf(sc->dev,
3540 			      "failed to get ring sizes or locations\n");
3541 		return EIO;
3542 	}
3543 
3544 	/* stock receive rings */
3545 	for (i = 0; i <= ss->rx_small.mask; i++) {
3546 		map = ss->rx_small.info[i].map;
3547 		err = mxge_get_buf_small(ss, map, i);
3548 		if (err) {
3549 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3550 				      i, ss->rx_small.mask + 1);
3551 			return ENOMEM;
3552 		}
3553 	}
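	/*
	 * an addr_low of 0xffffffff marks an entry the NIC must not
	 * use yet; mxge_submit_8rx() writes the real address last
	 */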
3554 	for (i = 0; i <= ss->rx_big.mask; i++) {
3555 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3556 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3557 	}
3558 	ss->rx_big.nbufs = nbufs;
3559 	ss->rx_big.cl_size = cl_size;
3560 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3561 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3562 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3563 		map = ss->rx_big.info[i].map;
3564 		err = mxge_get_buf_big(ss, map, i);
3565 		if (err) {
3566 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3567 				      i, ss->rx_big.mask + 1);
3568 			return ENOMEM;
3569 		}
3570 	}
3571 	return 0;
3572 }
3573 
3574 static int
3575 mxge_open(mxge_softc_t *sc)
3576 {
3577 	mxge_cmd_t cmd;
3578 	int err, big_bytes, nbufs, slice, cl_size, i;
3579 	bus_addr_t bus;
3580 	volatile uint8_t *itable;
3581 	struct mxge_slice_state *ss;
3582 
3583 	/* Copy the MAC address in case it was overridden */
3584 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3585 
3586 	err = mxge_reset(sc, 1);
3587 	if (err != 0) {
3588 		device_printf(sc->dev, "failed to reset\n");
3589 		return EIO;
3590 	}
3591 
3592 	if (sc->num_slices > 1) {
3593 		/* setup the indirection table */
3594 		cmd.data0 = sc->num_slices;
3595 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3596 				    &cmd);
3597 
3598 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3599 				     &cmd);
3600 		if (err != 0) {
3601 			device_printf(sc->dev,
3602 				      "failed to setup rss tables\n");
3603 			return err;
3604 		}
3605 
3606 		/* just enable an identity mapping */
3607 		itable = sc->sram + cmd.data0;
3608 		for (i = 0; i < sc->num_slices; i++)
3609 			itable[i] = (uint8_t)i;
3610 
3611 		cmd.data0 = 1;
3612 		cmd.data1 = mxge_rss_hash_type;
3613 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3614 		if (err != 0) {
3615 			device_printf(sc->dev, "failed to enable slices\n");
3616 			return err;
3617 		}
3618 	}
3619 
3620 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3621 
3622 	cmd.data0 = nbufs;
3623 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3624 			    &cmd);
3625 	/* error is only meaningful if we're trying to set
3626 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3627 	if (err && nbufs > 1) {
3628 		device_printf(sc->dev,
3629 			      "Failed to set always-use-n to %d\n",
3630 			      nbufs);
3631 		return EIO;
3632 	}
3633 	/* Give the firmware the mtu and the big and small buffer
3634 	   sizes.  The firmware wants the big buf size to be a power
3635 	   of two. Luckily, FreeBSD's clusters are powers of two */
3636 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3637 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3638 	cmd.data0 = MHLEN - MXGEFW_PAD;
3639 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3640 			     &cmd);
3641 	cmd.data0 = big_bytes;
3642 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3643 
3644 	if (err != 0) {
3645 		device_printf(sc->dev, "failed to setup params\n");
3646 		goto abort;
3647 	}
3648 
3649 	/* Now give the firmware a pointer to the stats block */
3650 	for (slice = 0;
3651 #ifdef IFNET_BUF_RING
3652 	     slice < sc->num_slices;
3653 #else
3654 	     slice < 1;
3655 #endif
3656 	     slice++) {
3657 		ss = &sc->ss[slice];
3658 		cmd.data0 =
3659 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3660 		cmd.data1 =
3661 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3662 		cmd.data2 = sizeof(struct mcp_irq_data);
3663 		cmd.data2 |= (slice << 16);
3664 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3665 	}
3666 
3667 	if (err != 0) {
3668 		bus = sc->ss->fw_stats_dma.bus_addr;
3669 		bus += offsetof(struct mcp_irq_data, send_done_count);
3670 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3671 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3672 		err = mxge_send_cmd(sc,
3673 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3674 				    &cmd);
3675 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3676 		sc->fw_multicast_support = 0;
3677 	} else {
3678 		sc->fw_multicast_support = 1;
3679 	}
3680 
3681 	if (err != 0) {
3682 		device_printf(sc->dev, "failed to setup params\n");
3683 		goto abort;
3684 	}
3685 
3686 	for (slice = 0; slice < sc->num_slices; slice++) {
3687 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3688 		if (err != 0) {
3689 			device_printf(sc->dev, "couldn't open slice %d\n",
3690 				      slice);
3691 			goto abort;
3692 		}
3693 	}
3694 
3695 	/* Finally, start the firmware running */
3696 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3697 	if (err) {
3698 		device_printf(sc->dev, "Couldn't bring up link\n");
3699 		goto abort;
3700 	}
3701 #ifdef IFNET_BUF_RING
3702 	for (slice = 0; slice < sc->num_slices; slice++) {
3703 		ss = &sc->ss[slice];
3704 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3705 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3706 	}
3707 #endif
3708 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3709 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3710 
3711 	return 0;
3712 
3713 abort:
3714 	mxge_free_mbufs(sc);
3715 
3716 	return err;
3717 }
3718 
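/*
 * Bring the NIC down.  When "down" is nonzero, the NIC is already
 * known to be down (e.g. the watchdog saw it reboot), so the
 * MXGEFW_CMD_ETHERNET_DOWN handshake and the wait for the down irq
 * are skipped and only the mbufs are freed.
 */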
3719 static int
3720 mxge_close(mxge_softc_t *sc, int down)
3721 {
3722 	mxge_cmd_t cmd;
3723 	int err, old_down_cnt;
3724 #ifdef IFNET_BUF_RING
3725 	struct mxge_slice_state *ss;
3726 	int slice;
3727 #endif
3728 
3729 #ifdef IFNET_BUF_RING
3730 	for (slice = 0; slice < sc->num_slices; slice++) {
3731 		ss = &sc->ss[slice];
3732 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3733 	}
3734 #endif
3735 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3736 	if (!down) {
3737 		old_down_cnt = sc->down_cnt;
3738 		wmb();
3739 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3740 		if (err) {
3741 			device_printf(sc->dev,
3742 				      "Couldn't bring down link\n");
3743 		}
3744 		if (old_down_cnt == sc->down_cnt) {
3745 			/* wait for down irq */
3746 			DELAY(10 * sc->intr_coal_delay);
3747 		}
3748 		wmb();
3749 		if (old_down_cnt == sc->down_cnt) {
3750 			device_printf(sc->dev, "never got down irq\n");
3751 		}
3752 	}
3753 	mxge_free_mbufs(sc);
3754 
3755 	return 0;
3756 }
3757 
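/*
 * Note: per the PCIe spec, offset 0x12 into the PCIe capability is the
 * Link Status register (bits 9:4 hold the negotiated link width), and
 * offset 0x8 is the Device Control register (bits 14:12 encode the max
 * read request size; the value 5 selects 4096 bytes).
 */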
3758 static void
3759 mxge_setup_cfg_space(mxge_softc_t *sc)
3760 {
3761 	device_t dev = sc->dev;
3762 	int reg;
3763 	uint16_t lnk, pectl;
3764 
3765 	/* find the PCIe link width and set max read request to 4KB */
3766 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3767 		lnk = pci_read_config(dev, reg + 0x12, 2);
3768 		sc->link_width = (lnk >> 4) & 0x3f;
3769 
3770 		if (sc->pectl == 0) {
3771 			pectl = pci_read_config(dev, reg + 0x8, 2);
3772 			pectl = (pectl & ~0x7000) | (5 << 12);
3773 			pci_write_config(dev, reg + 0x8, pectl, 2);
3774 			sc->pectl = pectl;
3775 		} else {
3776 			/* restore saved pectl after watchdog reset */
3777 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3778 		}
3779 	}
3780 
3781 	/* Enable DMA and Memory space access */
3782 	pci_enable_busmaster(dev);
3783 }
3784 
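/*
 * Read the firmware reboot status through Myricom's vendor-specific
 * PCI capability: vs + 0x10 selects 32-bit read mode, vs + 0x18 holds
 * the NIC address to read (0xfffffff0 is presumably the reboot status
 * register), and the data is returned at vs + 0x14.
 */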
3785 static uint32_t
3786 mxge_read_reboot(mxge_softc_t *sc)
3787 {
3788 	device_t dev = sc->dev;
3789 	uint32_t vs;
3790 
3791 	/* find the vendor specific offset */
3792 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3793 		device_printf(sc->dev,
3794 			      "could not find vendor specific offset\n");
3795 		return (uint32_t)-1;
3796 	}
3797 	/* enable read32 mode */
3798 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3799 	/* tell NIC which register to read */
3800 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3801 	return (pci_read_config(dev, vs + 0x14, 4));
3802 }
3803 
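/*
 * Recover from a firmware watchdog trip.  If the busmaster bit in the
 * PCI command register reads back as zero, the NIC rebooted and lost
 * its config space, so: quiesce TX under the per-slice locks, close
 * the interface, restore config space, reload the firmware and re-open.
 */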
3804 static void
3805 mxge_watchdog_reset(mxge_softc_t *sc)
3806 {
3807 	struct pci_devinfo *dinfo;
3808 	struct mxge_slice_state *ss;
3809 	int err, running, s, num_tx_slices = 1;
3810 	uint32_t reboot;
3811 	uint16_t cmd;
3812 
3813 	err = ENXIO;
3814 
3815 	device_printf(sc->dev, "Watchdog reset!\n");
3816 
3817 	/*
3818 	 * check to see if the NIC rebooted.  If it did, then all of
3819 	 * PCI config space has been reset, and things like the
3820 	 * busmaster bit will be zero.  If this is the case, then we
3821 	 * must restore PCI config space before the NIC can be used
3822 	 * again
3823 	 */
3824 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3825 	if (cmd == 0xffff) {
3826 		/*
3827 		 * maybe the watchdog caught the NIC rebooting; wait
3828 		 * up to 100ms for it to finish.  If it does not come
3829 		 * back, then give up
3830 		 */
3831 		DELAY(1000*100);
3832 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3833 		if (cmd == 0xffff) {
3834 			device_printf(sc->dev, "NIC disappeared!\n");
3835 		}
3836 	}
3837 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3838 		/* print the reboot status */
3839 		reboot = mxge_read_reboot(sc);
3840 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3841 			      reboot);
3842 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3843 		if (running) {
3844 			/*
3845 			 * quiesce NIC so that TX routines will not try to
3846 			 * xmit after restoration of BAR
3847 			 */
3848 
3849 			/* Mark the link as down */
3850 			if (sc->link_state) {
3851 				sc->link_state = 0;
3852 				if_link_state_change(sc->ifp,
3853 						     LINK_STATE_DOWN);
3854 			}
3855 #ifdef IFNET_BUF_RING
3856 			num_tx_slices = sc->num_slices;
3857 #endif
3858 			/* grab all TX locks to ensure no tx */
3859 			for (s = 0; s < num_tx_slices; s++) {
3860 				ss = &sc->ss[s];
3861 				mtx_lock(&ss->tx.mtx);
3862 			}
3863 			mxge_close(sc, 1);
3864 		}
3865 		/* restore PCI configuration space */
3866 		dinfo = device_get_ivars(sc->dev);
3867 		pci_cfg_restore(sc->dev, dinfo);
3868 
3869 		/* and redo any changes we made to our config space */
3870 		mxge_setup_cfg_space(sc);
3871 
3872 		/* reload f/w */
3873 		err = mxge_load_firmware(sc, 0);
3874 		if (err) {
3875 			device_printf(sc->dev,
3876 				      "Unable to re-load f/w\n");
3877 		}
3878 		if (running) {
3879 			if (!err)
3880 				err = mxge_open(sc);
3881 			/* release all TX locks */
3882 			for (s = 0; s < num_tx_slices; s++) {
3883 				ss = &sc->ss[s];
3884 #ifdef IFNET_BUF_RING
3885 				mxge_start_locked(ss);
3886 #endif
3887 				mtx_unlock(&ss->tx.mtx);
3888 			}
3889 		}
3890 		sc->watchdog_resets++;
3891 	} else {
3892 		device_printf(sc->dev,
3893 			      "NIC did not reboot, not resetting\n");
3894 		err = 0;
3895 	}
3896 	if (err) {
3897 		device_printf(sc->dev, "watchdog reset failed\n");
3898 	} else {
3899 		if (sc->dying == 2)
3900 			sc->dying = 0;
3901 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3902 	}
3903 }
3904 
3905 static void
3906 mxge_watchdog_task(void *arg, int pending)
3907 {
3908 	mxge_softc_t *sc = arg;
3909 
3910 	mtx_lock(&sc->driver_mtx);
3911 	mxge_watchdog_reset(sc);
3912 	mtx_unlock(&sc->driver_mtx);
3913 }
3914 
3915 static void
3916 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3917 {
3918 	tx = &sc->ss[slice].tx;
3919 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3920 	device_printf(sc->dev,
3921 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3922 		      tx->req, tx->done, tx->queue_active);
3923 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3924 			      tx->activate, tx->deactivate);
3925 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3926 		      tx->pkt_done,
3927 		      be32toh(sc->ss->fw_stats->send_done_count));
3928 }
3929 
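/*
 * Decide whether a TX ring is wedged: transmits are outstanding
 * (req != done) and the done count has not advanced since the last
 * tick.  A ring that is merely blocked by 802.3x pause frames is
 * reported but not reset, since that is the link partner's doing.
 */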
3930 static int
3931 mxge_watchdog(mxge_softc_t *sc)
3932 {
3933 	mxge_tx_ring_t *tx;
3934 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3935 	int i, err = 0;
3936 
3937 	/* see if we have outstanding transmits, which
3938 	   have been pending for more than mxge_ticks */
3939 	for (i = 0;
3940 #ifdef IFNET_BUF_RING
3941 	     (i < sc->num_slices) && (err == 0);
3942 #else
3943 	     (i < 1) && (err == 0);
3944 #endif
3945 	     i++) {
3946 		tx = &sc->ss[i].tx;
3947 		if (tx->req != tx->done &&
3948 		    tx->watchdog_req != tx->watchdog_done &&
3949 		    tx->done == tx->watchdog_done) {
3950 			/* check for pause blocking before resetting */
3951 			if (tx->watchdog_rx_pause == rx_pause) {
3952 				mxge_warn_stuck(sc, tx, i);
3953 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3954 				return (ENXIO);
3955 			}
3956 			else
3957 				device_printf(sc->dev, "Flow control blocking "
3958 					      "xmits, check link partner\n");
3959 		}
3960 
3961 		tx->watchdog_req = tx->req;
3962 		tx->watchdog_done = tx->done;
3963 		tx->watchdog_rx_pause = rx_pause;
3964 	}
3965 
3966 	if (sc->need_media_probe)
3967 		mxge_media_probe(sc);
3968 	return (err);
3969 }
3970 
3971 static uint64_t
3972 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
3973 {
3974 	struct mxge_softc *sc;
3975 	uint64_t rv;
3976 
3977 	sc = if_getsoftc(ifp);
3978 	rv = 0;
3979 
3980 	switch (cnt) {
3981 	case IFCOUNTER_IPACKETS:
3982 		for (int s = 0; s < sc->num_slices; s++)
3983 			rv += sc->ss[s].ipackets;
3984 		return (rv);
3985 	case IFCOUNTER_OPACKETS:
3986 		for (int s = 0; s < sc->num_slices; s++)
3987 			rv += sc->ss[s].opackets;
3988 		return (rv);
3989 	case IFCOUNTER_OERRORS:
3990 		for (int s = 0; s < sc->num_slices; s++)
3991 			rv += sc->ss[s].oerrors;
3992 		return (rv);
3993 #ifdef IFNET_BUF_RING
3994 	case IFCOUNTER_OBYTES:
3995 		for (int s = 0; s < sc->num_slices; s++)
3996 			rv += sc->ss[s].obytes;
3997 		return (rv);
3998 	case IFCOUNTER_OMCASTS:
3999 		for (int s = 0; s < sc->num_slices; s++)
4000 			rv += sc->ss[s].omcasts;
4001 		return (rv);
4002 	case IFCOUNTER_OQDROPS:
4003 		for (int s = 0; s < sc->num_slices; s++)
4004 			rv += sc->ss[s].tx.br->br_drops;
4005 		return (rv);
4006 #endif
4007 	default:
4008 		return (if_get_counter_default(ifp, cnt));
4009 	}
4010 }
4011 
4012 static void
4013 mxge_tick(void *arg)
4014 {
4015 	mxge_softc_t *sc = arg;
4016 	u_long pkts = 0;
4017 	int err = 0;
4018 	int running, ticks;
4019 	uint16_t cmd;
4020 
4021 	ticks = mxge_ticks;
4022 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4023 	if (running) {
4024 		if (!sc->watchdog_countdown) {
4025 			err = mxge_watchdog(sc);
4026 			sc->watchdog_countdown = 4;
4027 		}
4028 		sc->watchdog_countdown--;
4029 	}
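	/*
	 * Note: pkts is initialized to 0 and never updated, so this
	 * "idle" branch always runs: the PCI command register is
	 * polled for a h/w fault and the callout period is stretched
	 * 4x on every tick.
	 */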
4030 	if (pkts == 0) {
4031 		/* ensure NIC did not suffer h/w fault while idle */
4032 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4033 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4034 			sc->dying = 2;
4035 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4036 			err = ENXIO;
4037 		}
4038 		/* look less often if NIC is idle */
4039 		ticks *= 4;
4040 	}
4041 
4042 	if (err == 0)
4043 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4044 
4045 }
4046 
4047 static int
4048 mxge_media_change(struct ifnet *ifp)
4049 {
4050 	return EINVAL;
4051 }
4052 
4053 static int
4054 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4055 {
4056 	struct ifnet *ifp = sc->ifp;
4057 	int real_mtu, old_mtu;
4058 	int err = 0;
4059 
4060 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4061 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4062 		return EINVAL;
4063 	mtx_lock(&sc->driver_mtx);
4064 	old_mtu = ifp->if_mtu;
4065 	ifp->if_mtu = mtu;
4066 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4067 		mxge_close(sc, 0);
4068 		err = mxge_open(sc);
4069 		if (err != 0) {
4070 			ifp->if_mtu = old_mtu;
4071 			mxge_close(sc, 0);
4072 			(void) mxge_open(sc);
4073 		}
4074 	}
4075 	mtx_unlock(&sc->driver_mtx);
4076 	return err;
4077 }
4078 
4079 static void
4080 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4081 {
4082 	mxge_softc_t *sc = ifp->if_softc;
4083 
4084 	if (sc == NULL)
4085 		return;
4086 	ifmr->ifm_status = IFM_AVALID;
4087 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4088 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4089 	ifmr->ifm_active |= sc->current_media;
4090 }
4091 
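/*
 * Fetch transceiver data over i2c for SIOCGI2C.  Device addresses
 * 0xA0 and 0xA2 are the standard SFP/XFP serial-ID EEPROM and
 * diagnostics pages (per SFF-8472); each byte is requested with an
 * I2C_READ command and then polled out of the firmware's cache for
 * up to ~50ms.
 */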
4092 static int
4093 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4094 {
4095 	mxge_cmd_t cmd;
4096 	uint32_t i2c_args;
4097 	int i, ms, err;
4098 
4099 	if (i2c->dev_addr != 0xA0 &&
4100 	    i2c->dev_addr != 0xA2)
4101 		return (EINVAL);
4102 	if (i2c->len > sizeof(i2c->data))
4103 		return (EINVAL);
4104 
4105 	for (i = 0; i < i2c->len; i++) {
4106 		i2c_args = i2c->dev_addr << 0x8;
4107 		i2c_args |= i2c->offset + i;
4108 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4109 		cmd.data1 = i2c_args;
4110 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4111 
4112 		if (err != MXGEFW_CMD_OK)
4113 			return (EIO);
4114 		/* now we wait for the data to be cached */
4115 		cmd.data0 = i2c_args & 0xff;
4116 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4117 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4118 			cmd.data0 = i2c_args & 0xff;
4119 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4120 			if (err == EBUSY)
4121 				DELAY(1000);
4122 		}
4123 		if (err != MXGEFW_CMD_OK)
4124 			return (EIO);
4125 		i2c->data[i] = cmd.data0;
4126 	}
4127 	return (0);
4128 }
4129 
4130 static int
4131 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4132 {
4133 	mxge_softc_t *sc = ifp->if_softc;
4134 	struct ifreq *ifr = (struct ifreq *)data;
4135 	struct ifi2creq i2c;
4136 	int err, mask;
4137 
4138 	err = 0;
4139 	switch (command) {
4140 	case SIOCSIFMTU:
4141 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4142 		break;
4143 
4144 	case SIOCSIFFLAGS:
4145 		mtx_lock(&sc->driver_mtx);
4146 		if (sc->dying) {
4147 			mtx_unlock(&sc->driver_mtx);
4148 			return EINVAL;
4149 		}
4150 		if (ifp->if_flags & IFF_UP) {
4151 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4152 				err = mxge_open(sc);
4153 			} else {
4154 				/* take care of promisc and allmulti
4155 				   flag changes */
4156 				mxge_change_promisc(sc,
4157 						    ifp->if_flags & IFF_PROMISC);
4158 				mxge_set_multicast_list(sc);
4159 			}
4160 		} else {
4161 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4162 				mxge_close(sc, 0);
4163 			}
4164 		}
4165 		mtx_unlock(&sc->driver_mtx);
4166 		break;
4167 
4168 	case SIOCADDMULTI:
4169 	case SIOCDELMULTI:
4170 		mtx_lock(&sc->driver_mtx);
4171 		if (sc->dying) {
4172 			mtx_unlock(&sc->driver_mtx);
4173 			return (EINVAL);
4174 		}
4175 		mxge_set_multicast_list(sc);
4176 		mtx_unlock(&sc->driver_mtx);
4177 		break;
4178 
4179 	case SIOCSIFCAP:
4180 		mtx_lock(&sc->driver_mtx);
4181 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4182 		if (mask & IFCAP_TXCSUM) {
4183 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4184 				mask &= ~IFCAP_TSO4;
4185 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4186 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4187 			} else {
4188 				ifp->if_capenable |= IFCAP_TXCSUM;
4189 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4190 			}
4191 		}
4192 		if (mask & IFCAP_RXCSUM) {
4193 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4194 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4195 			} else {
4196 				ifp->if_capenable |= IFCAP_RXCSUM;
4197 			}
4198 		}
4199 		if (mask & IFCAP_TSO4) {
4200 			if (IFCAP_TSO4 & ifp->if_capenable) {
4201 				ifp->if_capenable &= ~IFCAP_TSO4;
4202 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4203 				ifp->if_capenable |= IFCAP_TSO4;
4204 				ifp->if_hwassist |= CSUM_TSO;
4205 			} else {
4206 				printf("mxge requires tx checksum offload"
4207 				       " be enabled to use TSO\n");
4208 				err = EINVAL;
4209 			}
4210 		}
4211 #if IFCAP_TSO6
4212 		if (mask & IFCAP_TXCSUM_IPV6) {
4213 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4214 				mask &= ~IFCAP_TSO6;
4215 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4216 						       | IFCAP_TSO6);
4217 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4218 						      | CSUM_UDP_IPV6);
4219 			} else {
4220 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4221 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4222 						     | CSUM_UDP_IPV6);
4223 			}
4224 		}
4225 		if (mask & IFCAP_RXCSUM_IPV6) {
4226 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4227 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4228 			} else {
4229 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4230 			}
4231 		}
4232 		if (mask & IFCAP_TSO6) {
4233 			if (IFCAP_TSO6 & ifp->if_capenable) {
4234 				ifp->if_capenable &= ~IFCAP_TSO6;
4235 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4236 				ifp->if_capenable |= IFCAP_TSO6;
4237 				ifp->if_hwassist |= CSUM_TSO;
4238 			} else {
4239 				printf("mxge requires tx checksum offload"
4240 				       " be enabled to use TSO\n");
4241 				err = EINVAL;
4242 			}
4243 		}
4244 #endif /*IFCAP_TSO6 */
4245 
4246 		if (mask & IFCAP_LRO)
4247 			ifp->if_capenable ^= IFCAP_LRO;
4248 		if (mask & IFCAP_VLAN_HWTAGGING)
4249 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4250 		if (mask & IFCAP_VLAN_HWTSO)
4251 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4252 
4253 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4254 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4255 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4256 
4257 		mtx_unlock(&sc->driver_mtx);
4258 		VLAN_CAPABILITIES(ifp);
4259 
4260 		break;
4261 
4262 	case SIOCGIFMEDIA:
4263 		mtx_lock(&sc->driver_mtx);
4264 		if (sc->dying) {
4265 			mtx_unlock(&sc->driver_mtx);
4266 			return (EINVAL);
4267 		}
4268 		mxge_media_probe(sc);
4269 		mtx_unlock(&sc->driver_mtx);
4270 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4271 				    &sc->media, command);
4272 		break;
4273 
4274 	case SIOCGI2C:
4275 		if (sc->connector != MXGE_XFP &&
4276 		    sc->connector != MXGE_SFP) {
4277 			err = ENXIO;
4278 			break;
4279 		}
4280 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4281 		if (err != 0)
4282 			break;
4283 		mtx_lock(&sc->driver_mtx);
4284 		if (sc->dying) {
4285 			mtx_unlock(&sc->driver_mtx);
4286 			return (EINVAL);
4287 		}
4288 		err = mxge_fetch_i2c(sc, &i2c);
4289 		mtx_unlock(&sc->driver_mtx);
4290 		if (err == 0)
4291 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4292 			    sizeof(i2c));
4293 		break;
4294 	default:
4295 		err = ether_ioctl(ifp, command, data);
4296 		break;
4297 	}
4298 	return err;
4299 }
4300 
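/*
 * Fetch boot-time tunables.  These are read with TUNABLE_INT_FETCH()
 * and so can be set from /boot/loader.conf, e.g. (illustrative values
 * only):
 *
 *	hw.mxge.max_slices="-1"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *
 * Both spellings of the rss hash tunable below are honored; values
 * out of range are clamped to defaults at the end of the function.
 */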
4301 static void
4302 mxge_fetch_tunables(mxge_softc_t *sc)
4303 {
4304 
4305 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4306 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4307 			  &mxge_flow_control);
4308 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4309 			  &mxge_intr_coal_delay);
4310 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4311 			  &mxge_nvidia_ecrc_enable);
4312 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4313 			  &mxge_force_firmware);
4314 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4315 			  &mxge_deassert_wait);
4316 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4317 			  &mxge_verbose);
4318 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4319 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4320 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4321 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4322 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4323 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4324 
4325 	if (bootverbose)
4326 		mxge_verbose = 1;
4327 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4328 		mxge_intr_coal_delay = 30;
4329 	if (mxge_ticks == 0)
4330 		mxge_ticks = hz / 2;
4331 	sc->pause = mxge_flow_control;
4332 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4333 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4334 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4335 	}
4336 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4337 	    mxge_initial_mtu < ETHER_MIN_LEN)
4338 		mxge_initial_mtu = ETHERMTU_JUMBO;
4339 
4340 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4341 		mxge_throttle = MXGE_MAX_THROTTLE;
4342 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4343 		mxge_throttle = MXGE_MIN_THROTTLE;
4344 	sc->throttle = mxge_throttle;
4345 }
4346 
4347 static void
4348 mxge_free_slices(mxge_softc_t *sc)
4349 {
4350 	struct mxge_slice_state *ss;
4351 	int i;
4352 
4353 	if (sc->ss == NULL)
4354 		return;
4355 
4356 	for (i = 0; i < sc->num_slices; i++) {
4357 		ss = &sc->ss[i];
4358 		if (ss->fw_stats != NULL) {
4359 			mxge_dma_free(&ss->fw_stats_dma);
4360 			ss->fw_stats = NULL;
4361 #ifdef IFNET_BUF_RING
4362 			if (ss->tx.br != NULL) {
4363 				drbr_free(ss->tx.br, M_DEVBUF);
4364 				ss->tx.br = NULL;
4365 			}
4366 #endif
4367 			mtx_destroy(&ss->tx.mtx);
4368 		}
4369 		if (ss->rx_done.entry != NULL) {
4370 			mxge_dma_free(&ss->rx_done.dma);
4371 			ss->rx_done.entry = NULL;
4372 		}
4373 	}
4374 	free(sc->ss, M_DEVBUF);
4375 	sc->ss = NULL;
4376 }
4377 
4378 static int
4379 mxge_alloc_slices(mxge_softc_t *sc)
4380 {
4381 	mxge_cmd_t cmd;
4382 	struct mxge_slice_state *ss;
4383 	size_t bytes;
4384 	int err, i, max_intr_slots;
4385 
4386 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4387 	if (err != 0) {
4388 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4389 		return err;
4390 	}
4391 	sc->rx_ring_size = cmd.data0;
4392 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4393 
4394 	bytes = sizeof (*sc->ss) * sc->num_slices;
4395 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4396 	if (sc->ss == NULL)
4397 		return (ENOMEM);
4398 	for (i = 0; i < sc->num_slices; i++) {
4399 		ss = &sc->ss[i];
4400 
4401 		ss->sc = sc;
4402 
4403 		/* allocate per-slice rx interrupt queues */
4404 
4405 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4406 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4407 		if (err != 0)
4408 			goto abort;
4409 		ss->rx_done.entry = ss->rx_done.dma.addr;
4410 		bzero(ss->rx_done.entry, bytes);
4411 
4412 		/*
4413 		 * allocate the per-slice firmware stats; stats
4414 		 * (including tx) are used only on the first
4415 		 * slice for now
4416 		 */
4417 #ifndef IFNET_BUF_RING
4418 		if (i > 0)
4419 			continue;
4420 #endif
4421 
4422 		bytes = sizeof (*ss->fw_stats);
4423 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4424 				     sizeof (*ss->fw_stats), 64);
4425 		if (err != 0)
4426 			goto abort;
4427 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4428 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4429 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4430 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4431 #ifdef IFNET_BUF_RING
4432 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4433 					   &ss->tx.mtx);
4434 #endif
4435 	}
4436 
4437 	return (0);
4438 
4439 abort:
4440 	mxge_free_slices(sc);
4441 	return (ENOMEM);
4442 }
4443 
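/*
 * Probe how many slices (queue pairs) to use.  Multiple slices require
 * the RSS firmware, at least two MSI-X vectors and an SMP system; the
 * count is clamped to the MSI-X vector count, then to the CPU count
 * (or hw.mxge.max_slices), and finally rounded down to a power of two.
 */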
4444 static void
4445 mxge_slice_probe(mxge_softc_t *sc)
4446 {
4447 	mxge_cmd_t cmd;
4448 	char *old_fw;
4449 	int msix_cnt, status, max_intr_slots;
4450 
4451 	sc->num_slices = 1;
4452 	/*
4453 	 *  don't enable multiple slices unless they are enabled
4454 	 *  via the tunable, and this is an SMP system
4455 	 */
4456 
4457 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4458 		return;
4459 
4460 	/* see how many MSI-X interrupts are available */
4461 	msix_cnt = pci_msix_count(sc->dev);
4462 	if (msix_cnt < 2)
4463 		return;
4464 
4465 	/* now load the slice-aware firmware to see what it supports */
4466 	old_fw = sc->fw_name;
4467 	if (old_fw == mxge_fw_aligned)
4468 		sc->fw_name = mxge_fw_rss_aligned;
4469 	else
4470 		sc->fw_name = mxge_fw_rss_unaligned;
4471 	status = mxge_load_firmware(sc, 0);
4472 	if (status != 0) {
4473 		device_printf(sc->dev, "Falling back to a single slice\n");
4474 		return;
4475 	}
4476 
4477 	/* try to send a reset command to the card to see if it
4478 	   is alive */
4479 	memset(&cmd, 0, sizeof (cmd));
4480 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4481 	if (status != 0) {
4482 		device_printf(sc->dev, "failed reset\n");
4483 		goto abort_with_fw;
4484 	}
4485 
4486 	/* get rx ring size */
4487 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4488 	if (status != 0) {
4489 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4490 		goto abort_with_fw;
4491 	}
4492 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4493 
4494 	/* tell it the size of the interrupt queues */
4495 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4496 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4497 	if (status != 0) {
4498 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4499 		goto abort_with_fw;
4500 	}
4501 
4502 	/* ask for the maximum number of slices it supports */
4503 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4504 	if (status != 0) {
4505 		device_printf(sc->dev,
4506 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4507 		goto abort_with_fw;
4508 	}
4509 	sc->num_slices = cmd.data0;
4510 	if (sc->num_slices > msix_cnt)
4511 		sc->num_slices = msix_cnt;
4512 
4513 	if (mxge_max_slices == -1) {
4514 		/* cap to number of CPUs in system */
4515 		if (sc->num_slices > mp_ncpus)
4516 			sc->num_slices = mp_ncpus;
4517 	} else {
4518 		if (sc->num_slices > mxge_max_slices)
4519 			sc->num_slices = mxge_max_slices;
4520 	}
4521 	/* round down to a power of two */
4522 	while (sc->num_slices & (sc->num_slices - 1))
4523 		sc->num_slices--;
4524 
4525 	if (mxge_verbose)
4526 		device_printf(sc->dev, "using %d slices\n",
4527 			      sc->num_slices);
4528 
4529 	return;
4530 
4531 abort_with_fw:
4532 	sc->fw_name = old_fw;
4533 	(void) mxge_load_firmware(sc, 0);
4534 }
4535 
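/*
 * Allocate one MSI-X vector per slice.  BAR(2) maps the MSI-X table,
 * IRQ resource ids 1..num_slices carry the per-slice messages, and
 * each handler is passed its own slice state.
 */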
4536 static int
4537 mxge_add_msix_irqs(mxge_softc_t *sc)
4538 {
4539 	size_t bytes;
4540 	int count, err, i, rid;
4541 
4542 	rid = PCIR_BAR(2);
4543 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4544 						    &rid, RF_ACTIVE);
4545 
4546 	if (sc->msix_table_res == NULL) {
4547 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4548 		return ENXIO;
4549 	}
4550 
4551 	count = sc->num_slices;
4552 	err = pci_alloc_msix(sc->dev, &count);
4553 	if (err != 0) {
4554 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4555 			      "err = %d\n", sc->num_slices, err);
4556 		goto abort_with_msix_table;
4557 	}
4558 	if (count < sc->num_slices) {
4559 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4560 			      sc->num_slices, count);
4561 		device_printf(sc->dev,
4562 			      "Try setting hw.mxge.max_slices to %d\n",
4563 			      count);
4564 		err = ENOSPC;
4565 		goto abort_with_msix;
4566 	}
4567 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4568 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4569 	if (sc->msix_irq_res == NULL) {
4570 		err = ENOMEM;
4571 		goto abort_with_msix;
4572 	}
4573 
4574 	for (i = 0; i < sc->num_slices; i++) {
4575 		rid = i + 1;
4576 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4577 							  SYS_RES_IRQ,
4578 							  &rid, RF_ACTIVE);
4579 		if (sc->msix_irq_res[i] == NULL) {
4580 			device_printf(sc->dev, "couldn't allocate IRQ res"
4581 				      " for message %d\n", i);
4582 			err = ENXIO;
4583 			goto abort_with_res;
4584 		}
4585 	}
4586 
4587 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4588 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4589 
4590 	for (i = 0; i < sc->num_slices; i++) {
4591 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4592 				     INTR_TYPE_NET | INTR_MPSAFE,
4593 #if __FreeBSD_version > 700030
4594 				     NULL,
4595 #endif
4596 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4597 		if (err != 0) {
4598 			device_printf(sc->dev, "couldn't setup intr for "
4599 				      "message %d\n", i);
4600 			goto abort_with_intr;
4601 		}
4602 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4603 				  sc->msix_ih[i], "s%d", i);
4604 	}
4605 
4606 	if (mxge_verbose) {
4607 		device_printf(sc->dev, "using %d msix IRQs:",
4608 			      sc->num_slices);
4609 		for (i = 0; i < sc->num_slices; i++)
4610 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4611 		printf("\n");
4612 	}
4613 	return (0);
4614 
4615 abort_with_intr:
4616 	for (i = 0; i < sc->num_slices; i++) {
4617 		if (sc->msix_ih[i] != NULL) {
4618 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4619 					  sc->msix_ih[i]);
4620 			sc->msix_ih[i] = NULL;
4621 		}
4622 	}
4623 	free(sc->msix_ih, M_DEVBUF);
4624 
4625 abort_with_res:
4626 	for (i = 0; i < sc->num_slices; i++) {
4627 		rid = i + 1;
4628 		if (sc->msix_irq_res[i] != NULL)
4629 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4630 					     sc->msix_irq_res[i]);
4631 		sc->msix_irq_res[i] = NULL;
4632 	}
4633 	free(sc->msix_irq_res, M_DEVBUF);
4634 
4635 abort_with_msix:
4636 	pci_release_msi(sc->dev);
4637 
4638 abort_with_msix_table:
4639 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4640 			     sc->msix_table_res);
4641 
4642 	return err;
4643 }
4644 
4645 static int
4646 mxge_add_single_irq(mxge_softc_t *sc)
4647 {
4648 	int count, err, rid;
4649 
4650 	count = pci_msi_count(sc->dev);
4651 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4652 		rid = 1;
4653 	} else {
4654 		rid = 0;
4655 		sc->legacy_irq = 1;
4656 	}
4657 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4658 					     RF_SHAREABLE | RF_ACTIVE);
4659 	if (sc->irq_res == NULL) {
4660 		device_printf(sc->dev, "could not alloc interrupt\n");
4661 		return ENXIO;
4662 	}
4663 	if (mxge_verbose)
4664 		device_printf(sc->dev, "using %s irq %jd\n",
4665 			      sc->legacy_irq ? "INTx" : "MSI",
4666 			      rman_get_start(sc->irq_res));
4667 	err = bus_setup_intr(sc->dev, sc->irq_res,
4668 			     INTR_TYPE_NET | INTR_MPSAFE,
4669 #if __FreeBSD_version > 700030
4670 			     NULL,
4671 #endif
4672 			     mxge_intr, &sc->ss[0], &sc->ih);
4673 	if (err != 0) {
4674 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4675 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4676 		if (!sc->legacy_irq)
4677 			pci_release_msi(sc->dev);
4678 	}
4679 	return err;
4680 }
4681 
4682 static void
4683 mxge_rem_msix_irqs(mxge_softc_t *sc)
4684 {
4685 	int i, rid;
4686 
4687 	for (i = 0; i < sc->num_slices; i++) {
4688 		if (sc->msix_ih[i] != NULL) {
4689 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4690 					  sc->msix_ih[i]);
4691 			sc->msix_ih[i] = NULL;
4692 		}
4693 	}
4694 	free(sc->msix_ih, M_DEVBUF);
4695 
4696 	for (i = 0; i < sc->num_slices; i++) {
4697 		rid = i + 1;
4698 		if (sc->msix_irq_res[i] != NULL)
4699 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4700 					     sc->msix_irq_res[i]);
4701 		sc->msix_irq_res[i] = NULL;
4702 	}
4703 	free(sc->msix_irq_res, M_DEVBUF);
4704 
4705 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4706 			     sc->msix_table_res);
4707 
4708 	pci_release_msi(sc->dev);
4709 	return;
4710 }
4711 
4712 static void
4713 mxge_rem_single_irq(mxge_softc_t *sc)
4714 {
4715 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4716 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4717 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4718 	if (!sc->legacy_irq)
4719 		pci_release_msi(sc->dev);
4720 }
4721 
4722 static void
4723 mxge_rem_irq(mxge_softc_t *sc)
4724 {
4725 	if (sc->num_slices > 1)
4726 		mxge_rem_msix_irqs(sc);
4727 	else
4728 		mxge_rem_single_irq(sc);
4729 }
4730 
4731 static int
4732 mxge_add_irq(mxge_softc_t *sc)
4733 {
4734 	int err;
4735 
4736 	if (sc->num_slices > 1)
4737 		err = mxge_add_msix_irqs(sc);
4738 	else
4739 		err = mxge_add_single_irq(sc);
4740 
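	/*
	 * The retry below is unreachable because of the leading
	 * "0 &&"; presumably dead code left over from debugging.
	 */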
4741 	if (0 && err == 0 && sc->num_slices > 1) {
4742 		mxge_rem_msix_irqs(sc);
4743 		err = mxge_add_msix_irqs(sc);
4744 	}
4745 	return err;
4746 }
4747 
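/*
 * Attach: map the board, parse the EEPROM strings, load firmware,
 * size the slices, allocate rings and interrupts, and register the
 * ifnet.  On failure the abort_with_* labels unwind everything in
 * reverse order of allocation.
 */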
4748 static int
4749 mxge_attach(device_t dev)
4750 {
4751 	mxge_cmd_t cmd;
4752 	mxge_softc_t *sc = device_get_softc(dev);
4753 	struct ifnet *ifp;
4754 	int err, rid;
4755 
4756 	sc->dev = dev;
4757 	mxge_fetch_tunables(sc);
4758 
4759 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4760 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4761 				  taskqueue_thread_enqueue, &sc->tq);
4762 	if (sc->tq == NULL) {
4763 		err = ENOMEM;
4764 		goto abort_with_nothing;
4765 	}
4766 
4767 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4768 				 1,			/* alignment */
4769 				 0,			/* boundary */
4770 				 BUS_SPACE_MAXADDR,	/* low */
4771 				 BUS_SPACE_MAXADDR,	/* high */
4772 				 NULL, NULL,		/* filter */
4773 				 65536 + 256,		/* maxsize */
4774 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4775 				 65536,			/* maxsegsize */
4776 				 0,			/* flags */
4777 				 NULL, NULL,		/* lock */
4778 				 &sc->parent_dmat);	/* tag */
4779 
4780 	if (err != 0) {
4781 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4782 			      err);
4783 		goto abort_with_tq;
4784 	}
4785 
4786 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4787 	if (ifp == NULL) {
4788 		device_printf(dev, "can not if_alloc()\n");
4789 		err = ENOSPC;
4790 		goto abort_with_parent_dmat;
4791 	}
4792 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4793 
4794 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4795 		 device_get_nameunit(dev));
4796 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4797 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4798 		 "%s:drv", device_get_nameunit(dev));
4799 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4800 		 MTX_NETWORK_LOCK, MTX_DEF);
4801 
4802 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4803 
4804 	mxge_setup_cfg_space(sc);
4805 
4806 	/* Map the board into the kernel */
4807 	rid = PCIR_BARS;
4808 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4809 					     RF_ACTIVE);
4810 	if (sc->mem_res == NULL) {
4811 		device_printf(dev, "could not map memory\n");
4812 		err = ENXIO;
4813 		goto abort_with_lock;
4814 	}
4815 	sc->sram = rman_get_virtual(sc->mem_res);
4816 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4817 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4818 		device_printf(dev, "impossible memory region size %jd\n",
4819 			      rman_get_size(sc->mem_res));
4820 		err = ENXIO;
4821 		goto abort_with_mem_res;
4822 	}
4823 
4824 	/* make a NUL-terminated copy of the EEPROM strings section of
4825 	   lanai SRAM */
4826 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4827 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4828 				rman_get_bushandle(sc->mem_res),
4829 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4830 				sc->eeprom_strings,
4831 				MXGE_EEPROM_STRINGS_SIZE - 2);
4832 	err = mxge_parse_strings(sc);
4833 	if (err != 0)
4834 		goto abort_with_mem_res;
4835 
4836 	/* Enable write combining for efficient use of PCIe bus */
4837 	mxge_enable_wc(sc);
4838 
4839 	/* Allocate the out of band dma memory */
4840 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4841 			     sizeof (mxge_cmd_t), 64);
4842 	if (err != 0)
4843 		goto abort_with_mem_res;
4844 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4845 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4846 	if (err != 0)
4847 		goto abort_with_cmd_dma;
4848 
4849 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4850 	if (err != 0)
4851 		goto abort_with_zeropad_dma;
4852 
4853 	/* select & load the firmware */
4854 	err = mxge_select_firmware(sc);
4855 	if (err != 0)
4856 		goto abort_with_dmabench;
4857 	sc->intr_coal_delay = mxge_intr_coal_delay;
4858 
4859 	mxge_slice_probe(sc);
4860 	err = mxge_alloc_slices(sc);
4861 	if (err != 0)
4862 		goto abort_with_dmabench;
4863 
4864 	err = mxge_reset(sc, 0);
4865 	if (err != 0)
4866 		goto abort_with_slices;
4867 
4868 	err = mxge_alloc_rings(sc);
4869 	if (err != 0) {
4870 		device_printf(sc->dev, "failed to allocate rings\n");
4871 		goto abort_with_slices;
4872 	}
4873 
4874 	err = mxge_add_irq(sc);
4875 	if (err != 0) {
4876 		device_printf(sc->dev, "failed to add irq\n");
4877 		goto abort_with_rings;
4878 	}
4879 
4880 	ifp->if_baudrate = IF_Gbps(10);
4881 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4882 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4883 		IFCAP_RXCSUM_IPV6;
4884 #if defined(INET) || defined(INET6)
4885 	ifp->if_capabilities |= IFCAP_LRO;
4886 #endif
4887 
4888 #ifdef MXGE_NEW_VLAN_API
4889 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4890 
4891 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4892 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4893 	    sc->fw_ver_tiny >= 32)
4894 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4895 #endif
4896 	sc->max_mtu = mxge_max_mtu(sc);
4897 	if (sc->max_mtu >= 9000)
4898 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4899 	else
4900 		device_printf(dev, "MTU limited to %d.  Install "
4901 			      "latest firmware for 9000 byte jumbo support\n",
4902 			      sc->max_mtu - ETHER_HDR_LEN);
4903 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4904 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4905 	/* check to see if f/w supports TSO for IPv6 */
4906 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4907 		if (CSUM_TCP_IPV6)
4908 			ifp->if_capabilities |= IFCAP_TSO6;
4909 		sc->max_tso6_hlen = min(cmd.data0,
4910 					sizeof (sc->ss[0].scratch));
4911 	}
4912 	ifp->if_capenable = ifp->if_capabilities;
4913 	if (sc->lro_cnt == 0)
4914 		ifp->if_capenable &= ~IFCAP_LRO;
4915 	ifp->if_init = mxge_init;
4916 	ifp->if_softc = sc;
4917 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4918 	ifp->if_ioctl = mxge_ioctl;
4919 	ifp->if_start = mxge_start;
4920 	ifp->if_get_counter = mxge_get_counter;
4921 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4922 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4923 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4924 	/* Initialise the ifmedia structure */
4925 	ifmedia_init(&sc->media, 0, mxge_media_change,
4926 		     mxge_media_status);
4927 	mxge_media_init(sc);
4928 	mxge_media_probe(sc);
4929 	sc->dying = 0;
4930 	ether_ifattach(ifp, sc->mac_addr);
4931 	/* ether_ifattach sets mtu to ETHERMTU */
4932 	if (mxge_initial_mtu != ETHERMTU)
4933 		mxge_change_mtu(sc, mxge_initial_mtu);
4934 
4935 	mxge_add_sysctls(sc);
4936 #ifdef IFNET_BUF_RING
4937 	ifp->if_transmit = mxge_transmit;
4938 	ifp->if_qflush = mxge_qflush;
4939 #endif
4940 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4941 				device_get_nameunit(sc->dev));
4942 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4943 	return 0;
4944 
4945 abort_with_rings:
4946 	mxge_free_rings(sc);
4947 abort_with_slices:
4948 	mxge_free_slices(sc);
4949 abort_with_dmabench:
4950 	mxge_dma_free(&sc->dmabench_dma);
4951 abort_with_zeropad_dma:
4952 	mxge_dma_free(&sc->zeropad_dma);
4953 abort_with_cmd_dma:
4954 	mxge_dma_free(&sc->cmd_dma);
4955 abort_with_mem_res:
4956 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4957 abort_with_lock:
4958 	pci_disable_busmaster(dev);
4959 	mtx_destroy(&sc->cmd_mtx);
4960 	mtx_destroy(&sc->driver_mtx);
4961 	if_free(ifp);
4962 abort_with_parent_dmat:
4963 	bus_dma_tag_destroy(sc->parent_dmat);
4964 abort_with_tq:
4965 	if (sc->tq != NULL) {
4966 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4967 		taskqueue_free(sc->tq);
4968 		sc->tq = NULL;
4969 	}
4970 abort_with_nothing:
4971 	return err;
4972 }
4973 
4974 static int
4975 mxge_detach(device_t dev)
4976 {
4977 	mxge_softc_t *sc = device_get_softc(dev);
4978 
4979 	if (mxge_vlans_active(sc)) {
4980 		device_printf(sc->dev,
4981 			      "Detach vlans before removing module\n");
4982 		return EBUSY;
4983 	}
4984 	mtx_lock(&sc->driver_mtx);
4985 	sc->dying = 1;
4986 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4987 		mxge_close(sc, 0);
4988 	mtx_unlock(&sc->driver_mtx);
4989 	ether_ifdetach(sc->ifp);
4990 	if (sc->tq != NULL) {
4991 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4992 		taskqueue_free(sc->tq);
4993 		sc->tq = NULL;
4994 	}
4995 	callout_drain(&sc->co_hdl);
4996 	ifmedia_removeall(&sc->media);
4997 	mxge_dummy_rdma(sc, 0);
4998 	mxge_rem_sysctls(sc);
4999 	mxge_rem_irq(sc);
5000 	mxge_free_rings(sc);
5001 	mxge_free_slices(sc);
5002 	mxge_dma_free(&sc->dmabench_dma);
5003 	mxge_dma_free(&sc->zeropad_dma);
5004 	mxge_dma_free(&sc->cmd_dma);
5005 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5006 	pci_disable_busmaster(dev);
5007 	mtx_destroy(&sc->cmd_mtx);
5008 	mtx_destroy(&sc->driver_mtx);
5009 	if_free(sc->ifp);
5010 	bus_dma_tag_destroy(sc->parent_dmat);
5011 	return 0;
5012 }
5013 
5014 static int
5015 mxge_shutdown(device_t dev)
5016 {
5017 	return 0;
5018 }
5019 
5020 /*
5021   This file uses Myri10GE driver indentation.
5022 
5023   Local Variables:
5024   c-file-style:"linux"
5025   tab-width:8
5026   End:
5027 */
5028