xref: /freebsd/sys/dev/mxge/if_mxge.c (revision e17f5b1d)
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12 
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28 
29 ***************************************************************************/
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <contrib/zlib/zlib.h>
51 #include <dev/zlib/zcalloc.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip6.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_lro.h>
71 #include <netinet6/ip6_var.h>
72 
73 #include <machine/bus.h>
74 #include <machine/in_cksum.h>
75 #include <machine/resource.h>
76 #include <sys/bus.h>
77 #include <sys/rman.h>
78 #include <sys/smp.h>
79 
80 #include <dev/pci/pcireg.h>
81 #include <dev/pci/pcivar.h>
82 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
83 
84 #include <vm/vm.h>		/* for pmap_mapdev() */
85 #include <vm/pmap.h>
86 
87 #if defined(__i386) || defined(__amd64)
88 #include <machine/specialreg.h>
89 #endif
90 
91 #include <dev/mxge/mxge_mcp.h>
92 #include <dev/mxge/mcp_gen_header.h>
93 /*#define MXGE_FAKE_IFP*/
94 #include <dev/mxge/if_mxge_var.h>
95 #ifdef IFNET_BUF_RING
96 #include <sys/buf_ring.h>
97 #endif
98 
99 #include "opt_inet.h"
100 #include "opt_inet6.h"
101 
/* tunable params */
/* nonzero lets mxge_enable_nvidia_ecrc() do its work; 0 turns it into a no-op */
static int mxge_nvidia_ecrc_enable = 1;
/*
 * 0 = let mxge_select_firmware() choose; 1 = force the aligned firmware;
 * any other nonzero value = force the unaligned firmware
 */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* nonzero enables extra informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
/* nonzero makes mxge_change_promisc() always request promiscuous mode */
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names; the selected one is stored in sc->fw_name */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
/* presumably the RSS-capable counterparts of the two images above */
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
119 
/*
 * Forward declarations.  The first four are installed in mxge_methods[]
 * below.
 */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* Method table referenced by mxge_driver. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

/* Driver description: name, method table, size of the per-device softc. */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* This module depends on the firmware and zlib modules. */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* Forward declarations for helpers that are called before they are defined. */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
156 
157 static int
158 mxge_probe(device_t dev)
159 {
160 	int rev;
161 
162 
163 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
164 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
165 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
166 		rev = pci_get_revid(dev);
167 		switch (rev) {
168 		case MXGE_PCI_REV_Z8E:
169 			device_set_desc(dev, "Myri10G-PCIE-8A");
170 			break;
171 		case MXGE_PCI_REV_Z8ES:
172 			device_set_desc(dev, "Myri10G-PCIE-8B");
173 			break;
174 		default:
175 			device_set_desc(dev, "Myri10G-PCIE-8??");
176 			device_printf(dev, "Unrecognized rev %d NIC\n",
177 				      rev);
178 			break;
179 		}
180 		return 0;
181 	}
182 	return ENXIO;
183 }
184 
185 static void
186 mxge_enable_wc(mxge_softc_t *sc)
187 {
188 #if defined(__i386) || defined(__amd64)
189 	vm_offset_t len;
190 	int err;
191 
192 	sc->wc = 1;
193 	len = rman_get_size(sc->mem_res);
194 	err = pmap_change_attr((vm_offset_t) sc->sram,
195 			       len, PAT_WRITE_COMBINING);
196 	if (err != 0) {
197 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
198 			      err);
199 		sc->wc = 0;
200 	}
201 #endif
202 }
203 
204 
205 /* callback to get our DMA address */
206 static void
207 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
208 			 int error)
209 {
210 	if (error == 0) {
211 		*(bus_addr_t *) arg = segs->ds_addr;
212 	}
213 }
214 
215 static int
216 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
217 		   bus_size_t alignment)
218 {
219 	int err;
220 	device_t dev = sc->dev;
221 	bus_size_t boundary, maxsegsize;
222 
223 	if (bytes > 4096 && alignment == 4096) {
224 		boundary = 0;
225 		maxsegsize = bytes;
226 	} else {
227 		boundary = 4096;
228 		maxsegsize = 4096;
229 	}
230 
231 	/* allocate DMAable memory tags */
232 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
233 				 alignment,		/* alignment */
234 				 boundary,		/* boundary */
235 				 BUS_SPACE_MAXADDR,	/* low */
236 				 BUS_SPACE_MAXADDR,	/* high */
237 				 NULL, NULL,		/* filter */
238 				 bytes,			/* maxsize */
239 				 1,			/* num segs */
240 				 maxsegsize,		/* maxsegsize */
241 				 BUS_DMA_COHERENT,	/* flags */
242 				 NULL, NULL,		/* lock */
243 				 &dma->dmat);		/* tag */
244 	if (err != 0) {
245 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
246 		return err;
247 	}
248 
249 	/* allocate DMAable memory & map */
250 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
251 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
252 				| BUS_DMA_ZERO),  &dma->map);
253 	if (err != 0) {
254 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
255 		goto abort_with_dmat;
256 	}
257 
258 	/* load the memory */
259 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
260 			      mxge_dmamap_callback,
261 			      (void *)&dma->bus_addr, 0);
262 	if (err != 0) {
263 		device_printf(dev, "couldn't load map (err = %d)\n", err);
264 		goto abort_with_mem;
265 	}
266 	return 0;
267 
268 abort_with_mem:
269 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
270 abort_with_dmat:
271 	(void)bus_dma_tag_destroy(dma->dmat);
272 	return err;
273 }
274 
275 
276 static void
277 mxge_dma_free(mxge_dma_t *dma)
278 {
279 	bus_dmamap_unload(dma->dmat, dma->map);
280 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 }
283 
284 /*
285  * The eeprom strings on the lanaiX have the format
286  * SN=x\0
287  * MAC=x:x:x:x:x:x\0
288  * PC=text\0
289  */
290 
291 static int
292 mxge_parse_strings(mxge_softc_t *sc)
293 {
294 	char *ptr;
295 	int i, found_mac, found_sn2;
296 	char *endptr;
297 
298 	ptr = sc->eeprom_strings;
299 	found_mac = 0;
300 	found_sn2 = 0;
301 	while (*ptr != '\0') {
302 		if (strncmp(ptr, "MAC=", 4) == 0) {
303 			ptr += 4;
304 			for (i = 0;;) {
305 				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
306 				if (endptr - ptr != 2)
307 					goto abort;
308 				ptr = endptr;
309 				if (++i == 6)
310 					break;
311 				if (*ptr++ != ':')
312 					goto abort;
313 			}
314 			found_mac = 1;
315 		} else if (strncmp(ptr, "PC=", 3) == 0) {
316 			ptr += 3;
317 			strlcpy(sc->product_code_string, ptr,
318 			    sizeof(sc->product_code_string));
319 		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
320 			ptr += 3;
321 			strlcpy(sc->serial_number_string, ptr,
322 			    sizeof(sc->serial_number_string));
323 		} else if (strncmp(ptr, "SN2=", 4) == 0) {
324 			/* SN2 takes precedence over SN */
325 			ptr += 4;
326 			found_sn2 = 1;
327 			strlcpy(sc->serial_number_string, ptr,
328 			    sizeof(sc->serial_number_string));
329 		}
330 		while (*ptr++ != '\0') {}
331 	}
332 
333 	if (found_mac)
334 		return 0;
335 
336  abort:
337 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
338 
339 	return ENXIO;
340 }
341 
342 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
343 static void
344 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
345 {
346 	uint32_t val;
347 	unsigned long base, off;
348 	char *va, *cfgptr;
349 	device_t pdev, mcp55;
350 	uint16_t vendor_id, device_id, word;
351 	uintptr_t bus, slot, func, ivend, idev;
352 	uint32_t *ptr32;
353 
354 
355 	if (!mxge_nvidia_ecrc_enable)
356 		return;
357 
358 	pdev = device_get_parent(device_get_parent(sc->dev));
359 	if (pdev == NULL) {
360 		device_printf(sc->dev, "could not find parent?\n");
361 		return;
362 	}
363 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
364 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
365 
366 	if (vendor_id != 0x10de)
367 		return;
368 
369 	base = 0;
370 
371 	if (device_id == 0x005d) {
372 		/* ck804, base address is magic */
373 		base = 0xe0000000UL;
374 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
375 		/* mcp55, base address stored in chipset */
376 		mcp55 = pci_find_bsf(0, 0, 0);
377 		if (mcp55 &&
378 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
379 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
380 			word = pci_read_config(mcp55, 0x90, 2);
381 			base = ((unsigned long)word & 0x7ffeU) << 25;
382 		}
383 	}
384 	if (!base)
385 		return;
386 
387 	/* XXXX
388 	   Test below is commented because it is believed that doing
389 	   config read/write beyond 0xff will access the config space
390 	   for the next larger function.  Uncomment this and remove
391 	   the hacky pmap_mapdev() way of accessing config space when
392 	   FreeBSD grows support for extended pcie config space access
393 	*/
394 #if 0
395 	/* See if we can, by some miracle, access the extended
396 	   config space */
397 	val = pci_read_config(pdev, 0x178, 4);
398 	if (val != 0xffffffff) {
399 		val |= 0x40;
400 		pci_write_config(pdev, 0x178, val, 4);
401 		return;
402 	}
403 #endif
404 	/* Rather than using normal pci config space writes, we must
405 	 * map the Nvidia config space ourselves.  This is because on
406 	 * opteron/nvidia class machine the 0xe000000 mapping is
407 	 * handled by the nvidia chipset, that means the internal PCI
408 	 * device (the on-chip northbridge), or the amd-8131 bridge
409 	 * and things behind them are not visible by this method.
410 	 */
411 
412 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 		      PCI_IVAR_BUS, &bus);
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_SLOT, &slot);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_FUNCTION, &func);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_VENDOR, &ivend);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_DEVICE, &idev);
422 
423 	off =  base
424 		+ 0x00100000UL * (unsigned long)bus
425 		+ 0x00001000UL * (unsigned long)(func
426 						 + 8 * slot);
427 
428 	/* map it into the kernel */
429 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
430 
431 
432 	if (va == NULL) {
433 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
434 		return;
435 	}
436 	/* get a pointer to the config space mapped into the kernel */
437 	cfgptr = va + (off & PAGE_MASK);
438 
439 	/* make sure that we can really access it */
440 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
441 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
442 	if (! (vendor_id == ivend && device_id == idev)) {
443 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
444 			      vendor_id, device_id);
445 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446 		return;
447 	}
448 
449 	ptr32 = (uint32_t*)(cfgptr + 0x178);
450 	val = *ptr32;
451 
452 	if (val == 0xffffffff) {
453 		device_printf(sc->dev, "extended mapping failed\n");
454 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
455 		return;
456 	}
457 	*ptr32 = val | 0x40;
458 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
459 	if (mxge_verbose)
460 		device_printf(sc->dev,
461 			      "Enabled ECRC on upstream Nvidia bridge "
462 			      "at %d:%d:%d\n",
463 			      (int)bus, (int)slot, (int)func);
464 	return;
465 }
466 #else
467 static void
468 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
469 {
470 	device_printf(sc->dev,
471 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
472 	return;
473 }
474 #endif
475 
476 
477 static int
478 mxge_dma_test(mxge_softc_t *sc, int test_type)
479 {
480 	mxge_cmd_t cmd;
481 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
482 	int status;
483 	uint32_t len;
484 	char *test = " ";
485 
486 
487 	/* Run a small DMA test.
488 	 * The magic multipliers to the length tell the firmware
489 	 * to do DMA read, write, or read+write tests.  The
490 	 * results are returned in cmd.data0.  The upper 16
491 	 * bits of the return is the number of transfers completed.
492 	 * The lower 16 bits is the time in 0.5us ticks that the
493 	 * transfers took to complete.
494 	 */
495 
496 	len = sc->tx_boundary;
497 
498 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
499 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
500 	cmd.data2 = len * 0x10000;
501 	status = mxge_send_cmd(sc, test_type, &cmd);
502 	if (status != 0) {
503 		test = "read";
504 		goto abort;
505 	}
506 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
507 		(cmd.data0 & 0xffff);
508 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
509 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
510 	cmd.data2 = len * 0x1;
511 	status = mxge_send_cmd(sc, test_type, &cmd);
512 	if (status != 0) {
513 		test = "write";
514 		goto abort;
515 	}
516 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
517 		(cmd.data0 & 0xffff);
518 
519 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
520 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
521 	cmd.data2 = len * 0x10001;
522 	status = mxge_send_cmd(sc, test_type, &cmd);
523 	if (status != 0) {
524 		test = "read/write";
525 		goto abort;
526 	}
527 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
528 		(cmd.data0 & 0xffff);
529 
530 abort:
531 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
532 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
533 			      test, status);
534 
535 	return status;
536 }
537 
538 /*
539  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
540  * when the PCI-E Completion packets are aligned on an 8-byte
541  * boundary.  Some PCI-E chip sets always align Completion packets; on
542  * the ones that do not, the alignment can be enforced by enabling
543  * ECRC generation (if supported).
544  *
545  * When PCI-E Completion packets are not aligned, it is actually more
546  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
547  *
548  * If the driver can neither enable ECRC nor verify that it has
549  * already been enabled, then it must use a firmware image which works
550  * around unaligned completion packets (ethp_z8e.dat), and it should
551  * also ensure that it never gives the device a Read-DMA which is
552  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
553  * enabled, then the driver should use the aligned (eth_z8e.dat)
554  * firmware image, and set tx_boundary to 4KB.
555  */
556 
557 static int
558 mxge_firmware_probe(mxge_softc_t *sc)
559 {
560 	device_t dev = sc->dev;
561 	int reg, status;
562 	uint16_t pectl;
563 
564 	sc->tx_boundary = 4096;
565 	/*
566 	 * Verify the max read request size was set to 4KB
567 	 * before trying the test with 4KB.
568 	 */
569 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
570 		pectl = pci_read_config(dev, reg + 0x8, 2);
571 		if ((pectl & (5 << 12)) != (5 << 12)) {
572 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
573 				      pectl);
574 			sc->tx_boundary = 2048;
575 		}
576 	}
577 
578 	/*
579 	 * load the optimized firmware (which assumes aligned PCIe
580 	 * completions) in order to see if it works on this host.
581 	 */
582 	sc->fw_name = mxge_fw_aligned;
583 	status = mxge_load_firmware(sc, 1);
584 	if (status != 0) {
585 		return status;
586 	}
587 
588 	/*
589 	 * Enable ECRC if possible
590 	 */
591 	mxge_enable_nvidia_ecrc(sc);
592 
593 	/*
594 	 * Run a DMA test which watches for unaligned completions and
595 	 * aborts on the first one seen.  Not required on Z8ES or newer.
596 	 */
597 	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
598 		return 0;
599 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
600 	if (status == 0)
601 		return 0; /* keep the aligned firmware */
602 
603 	if (status != E2BIG)
604 		device_printf(dev, "DMA test failed: %d\n", status);
605 	if (status == ENOSYS)
606 		device_printf(dev, "Falling back to ethp! "
607 			      "Please install up to date fw\n");
608 	return status;
609 }
610 
611 static int
612 mxge_select_firmware(mxge_softc_t *sc)
613 {
614 	int aligned = 0;
615 	int force_firmware = mxge_force_firmware;
616 
617 	if (sc->throttle)
618 		force_firmware = sc->throttle;
619 
620 	if (force_firmware != 0) {
621 		if (force_firmware == 1)
622 			aligned = 1;
623 		else
624 			aligned = 0;
625 		if (mxge_verbose)
626 			device_printf(sc->dev,
627 				      "Assuming %s completions (forced)\n",
628 				      aligned ? "aligned" : "unaligned");
629 		goto abort;
630 	}
631 
632 	/* if the PCIe link width is 4 or less, we can use the aligned
633 	   firmware and skip any checks */
634 	if (sc->link_width != 0 && sc->link_width <= 4) {
635 		device_printf(sc->dev,
636 			      "PCIe x%d Link, expect reduced performance\n",
637 			      sc->link_width);
638 		aligned = 1;
639 		goto abort;
640 	}
641 
642 	if (0 == mxge_firmware_probe(sc))
643 		return 0;
644 
645 abort:
646 	if (aligned) {
647 		sc->fw_name = mxge_fw_aligned;
648 		sc->tx_boundary = 4096;
649 	} else {
650 		sc->fw_name = mxge_fw_unaligned;
651 		sc->tx_boundary = 2048;
652 	}
653 	return (mxge_load_firmware(sc, 0));
654 }
655 
656 static int
657 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
658 {
659 
660 
661 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
662 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
663 			      be32toh(hdr->mcp_type));
664 		return EIO;
665 	}
666 
667 	/* save firmware version for sysctl */
668 	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
669 	if (mxge_verbose)
670 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
671 
672 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
673 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
674 
675 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
676 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
677 		device_printf(sc->dev, "Found firmware version %s\n",
678 			      sc->fw_version);
679 		device_printf(sc->dev, "Driver needs %d.%d\n",
680 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
681 		return EINVAL;
682 	}
683 	return 0;
684 
685 }
686 
687 static int
688 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
689 {
690 	z_stream zs;
691 	char *inflate_buffer;
692 	const struct firmware *fw;
693 	const mcp_gen_header_t *hdr;
694 	unsigned hdr_offset;
695 	int status;
696 	unsigned int i;
697 	char dummy;
698 	size_t fw_len;
699 
700 	fw = firmware_get(sc->fw_name);
701 	if (fw == NULL) {
702 		device_printf(sc->dev, "Could not find firmware image %s\n",
703 			      sc->fw_name);
704 		return ENOENT;
705 	}
706 
707 
708 
709 	/* setup zlib and decompress f/w */
710 	bzero(&zs, sizeof (zs));
711 	zs.zalloc = zcalloc_nowait;
712 	zs.zfree = zcfree;
713 	status = inflateInit(&zs);
714 	if (status != Z_OK) {
715 		status = EIO;
716 		goto abort_with_fw;
717 	}
718 
719 	/* the uncompressed size is stored as the firmware version,
720 	   which would otherwise go unused */
721 	fw_len = (size_t) fw->version;
722 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
723 	if (inflate_buffer == NULL)
724 		goto abort_with_zs;
725 	zs.avail_in = fw->datasize;
726 	zs.next_in = __DECONST(char *, fw->data);
727 	zs.avail_out = fw_len;
728 	zs.next_out = inflate_buffer;
729 	status = inflate(&zs, Z_FINISH);
730 	if (status != Z_STREAM_END) {
731 		device_printf(sc->dev, "zlib %d\n", status);
732 		status = EIO;
733 		goto abort_with_buffer;
734 	}
735 
736 	/* check id */
737 	hdr_offset = htobe32(*(const uint32_t *)
738 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
739 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
740 		device_printf(sc->dev, "Bad firmware file");
741 		status = EIO;
742 		goto abort_with_buffer;
743 	}
744 	hdr = (const void*)(inflate_buffer + hdr_offset);
745 
746 	status = mxge_validate_firmware(sc, hdr);
747 	if (status != 0)
748 		goto abort_with_buffer;
749 
750 	/* Copy the inflated firmware to NIC SRAM. */
751 	for (i = 0; i < fw_len; i += 256) {
752 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
753 			      inflate_buffer + i,
754 			      min(256U, (unsigned)(fw_len - i)));
755 		wmb();
756 		dummy = *sc->sram;
757 		wmb();
758 	}
759 
760 	*limit = fw_len;
761 	status = 0;
762 abort_with_buffer:
763 	free(inflate_buffer, M_TEMP);
764 abort_with_zs:
765 	inflateEnd(&zs);
766 abort_with_fw:
767 	firmware_put(fw, FIRMWARE_UNLOAD);
768 	return status;
769 }
770 
771 /*
772  * Enable or disable periodic RDMAs from the host to make certain
773  * chipsets resend dropped PCIe messages
774  */
775 
776 static void
777 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
778 {
779 	char buf_bytes[72];
780 	volatile uint32_t *confirm;
781 	volatile char *submit;
782 	uint32_t *buf, dma_low, dma_high;
783 	int i;
784 
785 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
786 
787 	/* clear confirmation addr */
788 	confirm = (volatile uint32_t *)sc->cmd;
789 	*confirm = 0;
790 	wmb();
791 
792 	/* send an rdma command to the PCIe engine, and wait for the
793 	   response in the confirmation address.  The firmware should
794 	   write a -1 there to indicate it is alive and well
795 	*/
796 
797 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
798 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
799 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
800 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
801 	buf[2] = htobe32(0xffffffff);		/* confirm data */
802 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
803 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
804 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
805 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
806 	buf[5] = htobe32(enable);			/* enable? */
807 
808 
809 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
810 
811 	mxge_pio_copy(submit, buf, 64);
812 	wmb();
813 	DELAY(1000);
814 	wmb();
815 	i = 0;
816 	while (*confirm != 0xffffffff && i < 20) {
817 		DELAY(1000);
818 		i++;
819 	}
820 	if (*confirm != 0xffffffff) {
821 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
822 			      (enable ? "enable" : "disable"), confirm,
823 			      *confirm);
824 	}
825 	return;
826 }
827 
828 static int
829 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
830 {
831 	mcp_cmd_t *buf;
832 	char buf_bytes[sizeof(*buf) + 8];
833 	volatile mcp_cmd_response_t *response = sc->cmd;
834 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
835 	uint32_t dma_low, dma_high;
836 	int err, sleep_total = 0;
837 
838 	/* ensure buf is aligned to 8 bytes */
839 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
840 
841 	buf->data0 = htobe32(data->data0);
842 	buf->data1 = htobe32(data->data1);
843 	buf->data2 = htobe32(data->data2);
844 	buf->cmd = htobe32(cmd);
845 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
846 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
847 
848 	buf->response_addr.low = htobe32(dma_low);
849 	buf->response_addr.high = htobe32(dma_high);
850 	mtx_lock(&sc->cmd_mtx);
851 	response->result = 0xffffffff;
852 	wmb();
853 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
854 
855 	/* wait up to 20ms */
856 	err = EAGAIN;
857 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
858 		bus_dmamap_sync(sc->cmd_dma.dmat,
859 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
860 		wmb();
861 		switch (be32toh(response->result)) {
862 		case 0:
863 			data->data0 = be32toh(response->data);
864 			err = 0;
865 			break;
866 		case 0xffffffff:
867 			DELAY(1000);
868 			break;
869 		case MXGEFW_CMD_UNKNOWN:
870 			err = ENOSYS;
871 			break;
872 		case MXGEFW_CMD_ERROR_UNALIGNED:
873 			err = E2BIG;
874 			break;
875 		case MXGEFW_CMD_ERROR_BUSY:
876 			err = EBUSY;
877 			break;
878 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
879 			err = ENXIO;
880 			break;
881 		default:
882 			device_printf(sc->dev,
883 				      "mxge: command %d "
884 				      "failed, result = %d\n",
885 				      cmd, be32toh(response->result));
886 			err = ENXIO;
887 			break;
888 		}
889 		if (err != EAGAIN)
890 			break;
891 	}
892 	if (err == EAGAIN)
893 		device_printf(sc->dev, "mxge: command %d timed out"
894 			      "result = %d\n",
895 			      cmd, be32toh(response->result));
896 	mtx_unlock(&sc->cmd_mtx);
897 	return err;
898 }
899 
900 static int
901 mxge_adopt_running_firmware(mxge_softc_t *sc)
902 {
903 	struct mcp_gen_header *hdr;
904 	const size_t bytes = sizeof (struct mcp_gen_header);
905 	size_t hdr_offset;
906 	int status;
907 
908 	/* find running firmware header */
909 	hdr_offset = htobe32(*(volatile uint32_t *)
910 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
911 
912 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
913 		device_printf(sc->dev,
914 			      "Running firmware has bad header offset (%d)\n",
915 			      (int)hdr_offset);
916 		return EIO;
917 	}
918 
919 	/* copy header of running firmware from SRAM to host memory to
920 	 * validate firmware */
921 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
922 	if (hdr == NULL) {
923 		device_printf(sc->dev, "could not malloc firmware hdr\n");
924 		return ENOMEM;
925 	}
926 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
927 				rman_get_bushandle(sc->mem_res),
928 				hdr_offset, (char *)hdr, bytes);
929 	status = mxge_validate_firmware(sc, hdr);
930 	free(hdr, M_DEVBUF);
931 
932 	/*
933 	 * check to see if adopted firmware has bug where adopting
934 	 * it will cause broadcasts to be filtered unless the NIC
935 	 * is kept in ALLMULTI mode
936 	 */
937 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
938 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
939 		sc->adopted_rx_filter_bug = 1;
940 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
941 			      "working around rx filter bug\n",
942 			      sc->fw_ver_major, sc->fw_ver_minor,
943 			      sc->fw_ver_tiny);
944 	}
945 
946 	return status;
947 }
948 
949 
950 static int
951 mxge_load_firmware(mxge_softc_t *sc, int adopt)
952 {
953 	volatile uint32_t *confirm;
954 	volatile char *submit;
955 	char buf_bytes[72];
956 	uint32_t *buf, size, dma_low, dma_high;
957 	int status, i;
958 
959 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
960 
961 	size = sc->sram_size;
962 	status = mxge_load_firmware_helper(sc, &size);
963 	if (status) {
964 		if (!adopt)
965 			return status;
966 		/* Try to use the currently running firmware, if
967 		   it is new enough */
968 		status = mxge_adopt_running_firmware(sc);
969 		if (status) {
970 			device_printf(sc->dev,
971 				      "failed to adopt running firmware\n");
972 			return status;
973 		}
974 		device_printf(sc->dev,
975 			      "Successfully adopted running firmware\n");
976 		if (sc->tx_boundary == 4096) {
977 			device_printf(sc->dev,
978 				"Using firmware currently running on NIC"
979 				 ".  For optimal\n");
980 			device_printf(sc->dev,
981 				 "performance consider loading optimized "
982 				 "firmware\n");
983 		}
984 		sc->fw_name = mxge_fw_unaligned;
985 		sc->tx_boundary = 2048;
986 		return 0;
987 	}
988 	/* clear confirmation addr */
989 	confirm = (volatile uint32_t *)sc->cmd;
990 	*confirm = 0;
991 	wmb();
992 	/* send a reload command to the bootstrap MCP, and wait for the
993 	   response in the confirmation address.  The firmware should
994 	   write a -1 there to indicate it is alive and well
995 	*/
996 
997 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
998 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
999 
1000 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1001 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1002 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1003 
1004 	/* FIX: All newest firmware should un-protect the bottom of
1005 	   the sram before handoff. However, the very first interfaces
1006 	   do not. Therefore the handoff copy must skip the first 8 bytes
1007 	*/
1008 					/* where the code starts*/
1009 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1010 	buf[4] = htobe32(size - 8); 	/* length of code */
1011 	buf[5] = htobe32(8);		/* where to copy to */
1012 	buf[6] = htobe32(0);		/* where to jump to */
1013 
1014 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1015 	mxge_pio_copy(submit, buf, 64);
1016 	wmb();
1017 	DELAY(1000);
1018 	wmb();
1019 	i = 0;
1020 	while (*confirm != 0xffffffff && i < 20) {
1021 		DELAY(1000*10);
1022 		i++;
1023 		bus_dmamap_sync(sc->cmd_dma.dmat,
1024 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1025 	}
1026 	if (*confirm != 0xffffffff) {
1027 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1028 			confirm, *confirm);
1029 
1030 		return ENXIO;
1031 	}
1032 	return 0;
1033 }
1034 
1035 static int
1036 mxge_update_mac_address(mxge_softc_t *sc)
1037 {
1038 	mxge_cmd_t cmd;
1039 	uint8_t *addr = sc->mac_addr;
1040 	int status;
1041 
1042 
1043 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1044 		     | (addr[2] << 8) | addr[3]);
1045 
1046 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1047 
1048 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1049 	return status;
1050 }
1051 
1052 static int
1053 mxge_change_pause(mxge_softc_t *sc, int pause)
1054 {
1055 	mxge_cmd_t cmd;
1056 	int status;
1057 
1058 	if (pause)
1059 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1060 				       &cmd);
1061 	else
1062 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1063 				       &cmd);
1064 
1065 	if (status) {
1066 		device_printf(sc->dev, "Failed to set flow control mode\n");
1067 		return ENXIO;
1068 	}
1069 	sc->pause = pause;
1070 	return 0;
1071 }
1072 
1073 static void
1074 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1075 {
1076 	mxge_cmd_t cmd;
1077 	int status;
1078 
1079 	if (mxge_always_promisc)
1080 		promisc = 1;
1081 
1082 	if (promisc)
1083 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1084 				       &cmd);
1085 	else
1086 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1087 				       &cmd);
1088 
1089 	if (status) {
1090 		device_printf(sc->dev, "Failed to set promisc mode\n");
1091 	}
1092 }
1093 
/*
 * Context handed to mxge_add_maddr() through if_foreach_llmaddr() while
 * the multicast list is being programmed.
 */
struct mxge_add_maddr_ctx {
	mxge_softc_t *sc;	/* device the addresses are added to */
	int error;		/* first non-zero mxge_send_cmd() status */
};
1098 
1099 static u_int
1100 mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
1101 {
1102 	struct mxge_add_maddr_ctx *ctx = arg;
1103 	mxge_cmd_t cmd;
1104 
1105 	if (ctx->error != 0)
1106 		return (0);
1107 	bcopy(LLADDR(sdl), &cmd.data0, 4);
1108 	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
1109 	cmd.data0 = htonl(cmd.data0);
1110 	cmd.data1 = htonl(cmd.data1);
1111 
1112 	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1113 
1114 	return (1);
1115 }
1116 
1117 static void
1118 mxge_set_multicast_list(mxge_softc_t *sc)
1119 {
1120 	struct mxge_add_maddr_ctx ctx;
1121 	struct ifnet *ifp = sc->ifp;
1122 	mxge_cmd_t cmd;
1123 	int err;
1124 
1125 	/* This firmware is known to not support multicast */
1126 	if (!sc->fw_multicast_support)
1127 		return;
1128 
1129 	/* Disable multicast filtering while we play with the lists*/
1130 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1131 	if (err != 0) {
1132 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1133 		       " error status: %d\n", err);
1134 		return;
1135 	}
1136 
1137 	if (sc->adopted_rx_filter_bug)
1138 		return;
1139 
1140 	if (ifp->if_flags & IFF_ALLMULTI)
1141 		/* request to disable multicast filtering, so quit here */
1142 		return;
1143 
1144 	/* Flush all the filters */
1145 
1146 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1147 	if (err != 0) {
1148 		device_printf(sc->dev,
1149 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1150 			      ", error status: %d\n", err);
1151 		return;
1152 	}
1153 
1154 	/* Walk the multicast list, and add each address */
1155 	ctx.sc = sc;
1156 	ctx.error = 0;
1157 	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
1158 	if (ctx.error != 0) {
1159 		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1160 		    "error status:" "%d\t", ctx.error);
1161 		/* abort, leaving multicast filtering off */
1162 		return;
1163 	}
1164 
1165 	/* Enable multicast filtering */
1166 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1167 	if (err != 0) {
1168 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1169 		       ", error status: %d\n", err);
1170 	}
1171 }
1172 
1173 static int
1174 mxge_max_mtu(mxge_softc_t *sc)
1175 {
1176 	mxge_cmd_t cmd;
1177 	int status;
1178 
1179 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1180 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1181 
1182 	/* try to set nbufs to see if it we can
1183 	   use virtually contiguous jumbos */
1184 	cmd.data0 = 0;
1185 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1186 			       &cmd);
1187 	if (status == 0)
1188 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1189 
1190 	/* otherwise, we're limited to MJUMPAGESIZE */
1191 	return MJUMPAGESIZE - MXGEFW_PAD;
1192 }
1193 
1194 static int
1195 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1196 {
1197 	struct mxge_slice_state *ss;
1198 	mxge_rx_done_t *rx_done;
1199 	volatile uint32_t *irq_claim;
1200 	mxge_cmd_t cmd;
1201 	int slice, status;
1202 
1203 	/* try to send a reset command to the card to see if it
1204 	   is alive */
1205 	memset(&cmd, 0, sizeof (cmd));
1206 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1207 	if (status != 0) {
1208 		device_printf(sc->dev, "failed reset\n");
1209 		return ENXIO;
1210 	}
1211 
1212 	mxge_dummy_rdma(sc, 1);
1213 
1214 
1215 	/* set the intrq size */
1216 	cmd.data0 = sc->rx_ring_size;
1217 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1218 
1219 	/*
1220 	 * Even though we already know how many slices are supported
1221 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1222 	 * has magic side effects, and must be called after a reset.
1223 	 * It must be called prior to calling any RSS related cmds,
1224 	 * including assigning an interrupt queue for anything but
1225 	 * slice 0.  It must also be called *after*
1226 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1227 	 * the firmware to compute offsets.
1228 	 */
1229 
1230 	if (sc->num_slices > 1) {
1231 		/* ask the maximum number of slices it supports */
1232 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1233 					   &cmd);
1234 		if (status != 0) {
1235 			device_printf(sc->dev,
1236 				      "failed to get number of slices\n");
1237 			return status;
1238 		}
1239 		/*
1240 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1241 		 * to setting up the interrupt queue DMA
1242 		 */
1243 		cmd.data0 = sc->num_slices;
1244 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1245 #ifdef IFNET_BUF_RING
1246 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1247 #endif
1248 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1249 					   &cmd);
1250 		if (status != 0) {
1251 			device_printf(sc->dev,
1252 				      "failed to set number of slices\n");
1253 			return status;
1254 		}
1255 	}
1256 
1257 
1258 	if (interrupts_setup) {
1259 		/* Now exchange information about interrupts  */
1260 		for (slice = 0; slice < sc->num_slices; slice++) {
1261 			rx_done = &sc->ss[slice].rx_done;
1262 			memset(rx_done->entry, 0, sc->rx_ring_size);
1263 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1264 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1265 			cmd.data2 = slice;
1266 			status |= mxge_send_cmd(sc,
1267 						MXGEFW_CMD_SET_INTRQ_DMA,
1268 						&cmd);
1269 		}
1270 	}
1271 
1272 	status |= mxge_send_cmd(sc,
1273 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1274 
1275 
1276 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1277 
1278 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1279 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1280 
1281 
1282 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1283 				&cmd);
1284 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1285 	if (status != 0) {
1286 		device_printf(sc->dev, "failed set interrupt parameters\n");
1287 		return status;
1288 	}
1289 
1290 
1291 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1292 
1293 
1294 	/* run a DMA benchmark */
1295 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1296 
1297 	for (slice = 0; slice < sc->num_slices; slice++) {
1298 		ss = &sc->ss[slice];
1299 
1300 		ss->irq_claim = irq_claim + (2 * slice);
1301 		/* reset mcp/driver shared state back to 0 */
1302 		ss->rx_done.idx = 0;
1303 		ss->rx_done.cnt = 0;
1304 		ss->tx.req = 0;
1305 		ss->tx.done = 0;
1306 		ss->tx.pkt_done = 0;
1307 		ss->tx.queue_active = 0;
1308 		ss->tx.activate = 0;
1309 		ss->tx.deactivate = 0;
1310 		ss->tx.wake = 0;
1311 		ss->tx.defrag = 0;
1312 		ss->tx.stall = 0;
1313 		ss->rx_big.cnt = 0;
1314 		ss->rx_small.cnt = 0;
1315 		ss->lc.lro_bad_csum = 0;
1316 		ss->lc.lro_queued = 0;
1317 		ss->lc.lro_flushed = 0;
1318 		if (ss->fw_stats != NULL) {
1319 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1320 		}
1321 	}
1322 	sc->rdma_tags_available = 15;
1323 	status = mxge_update_mac_address(sc);
1324 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1325 	mxge_change_pause(sc, sc->pause);
1326 	mxge_set_multicast_list(sc);
1327 	if (sc->throttle) {
1328 		cmd.data0 = sc->throttle;
1329 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1330 				  &cmd)) {
1331 			device_printf(sc->dev,
1332 				      "can't enable throttle\n");
1333 		}
1334 	}
1335 	return status;
1336 }
1337 
1338 static int
1339 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1340 {
1341 	mxge_cmd_t cmd;
1342 	mxge_softc_t *sc;
1343 	int err;
1344 	unsigned int throttle;
1345 
1346 	sc = arg1;
1347 	throttle = sc->throttle;
1348 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1349 	if (err != 0) {
1350 		return err;
1351 	}
1352 
1353 	if (throttle == sc->throttle)
1354 		return 0;
1355 
1356 	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1357 		return EINVAL;
1358 
1359 	mtx_lock(&sc->driver_mtx);
1360 	cmd.data0 = throttle;
1361 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1362 	if (err == 0)
1363 		sc->throttle = throttle;
1364 	mtx_unlock(&sc->driver_mtx);
1365 	return err;
1366 }
1367 
1368 static int
1369 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1370 {
1371 	mxge_softc_t *sc;
1372 	unsigned int intr_coal_delay;
1373 	int err;
1374 
1375 	sc = arg1;
1376 	intr_coal_delay = sc->intr_coal_delay;
1377 	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1378 	if (err != 0) {
1379 		return err;
1380 	}
1381 	if (intr_coal_delay == sc->intr_coal_delay)
1382 		return 0;
1383 
1384 	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1385 		return EINVAL;
1386 
1387 	mtx_lock(&sc->driver_mtx);
1388 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1389 	sc->intr_coal_delay = intr_coal_delay;
1390 
1391 	mtx_unlock(&sc->driver_mtx);
1392 	return err;
1393 }
1394 
1395 static int
1396 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1397 {
1398 	mxge_softc_t *sc;
1399 	unsigned int enabled;
1400 	int err;
1401 
1402 	sc = arg1;
1403 	enabled = sc->pause;
1404 	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1405 	if (err != 0) {
1406 		return err;
1407 	}
1408 	if (enabled == sc->pause)
1409 		return 0;
1410 
1411 	mtx_lock(&sc->driver_mtx);
1412 	err = mxge_change_pause(sc, enabled);
1413 	mtx_unlock(&sc->driver_mtx);
1414 	return err;
1415 }
1416 
1417 static int
1418 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1419 {
1420 	int err;
1421 
1422 	if (arg1 == NULL)
1423 		return EFAULT;
1424 	arg2 = be32toh(*(int *)arg1);
1425 	arg1 = NULL;
1426 	err = sysctl_handle_int(oidp, arg1, arg2, req);
1427 
1428 	return err;
1429 }
1430 
1431 static void
1432 mxge_rem_sysctls(mxge_softc_t *sc)
1433 {
1434 	struct mxge_slice_state *ss;
1435 	int slice;
1436 
1437 	if (sc->slice_sysctl_tree == NULL)
1438 		return;
1439 
1440 	for (slice = 0; slice < sc->num_slices; slice++) {
1441 		ss = &sc->ss[slice];
1442 		if (ss == NULL || ss->sysctl_tree == NULL)
1443 			continue;
1444 		sysctl_ctx_free(&ss->sysctl_ctx);
1445 		ss->sysctl_tree = NULL;
1446 	}
1447 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1448 	sc->slice_sysctl_tree = NULL;
1449 }
1450 
1451 static void
1452 mxge_add_sysctls(mxge_softc_t *sc)
1453 {
1454 	struct sysctl_ctx_list *ctx;
1455 	struct sysctl_oid_list *children;
1456 	mcp_irq_data_t *fw;
1457 	struct mxge_slice_state *ss;
1458 	int slice;
1459 	char slice_num[8];
1460 
1461 	ctx = device_get_sysctl_ctx(sc->dev);
1462 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1463 	fw = sc->ss[0].fw_stats;
1464 
1465 	/* random information */
1466 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1467 		       "firmware_version",
1468 		       CTLFLAG_RD, sc->fw_version,
1469 		       0, "firmware version");
1470 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 		       "serial_number",
1472 		       CTLFLAG_RD, sc->serial_number_string,
1473 		       0, "serial number");
1474 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475 		       "product_code",
1476 		       CTLFLAG_RD, sc->product_code_string,
1477 		       0, "product_code");
1478 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479 		       "pcie_link_width",
1480 		       CTLFLAG_RD, &sc->link_width,
1481 		       0, "tx_boundary");
1482 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 		       "tx_boundary",
1484 		       CTLFLAG_RD, &sc->tx_boundary,
1485 		       0, "tx_boundary");
1486 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 		       "write_combine",
1488 		       CTLFLAG_RD, &sc->wc,
1489 		       0, "write combining PIO?");
1490 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 		       "read_dma_MBs",
1492 		       CTLFLAG_RD, &sc->read_dma,
1493 		       0, "DMA Read speed in MB/s");
1494 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 		       "write_dma_MBs",
1496 		       CTLFLAG_RD, &sc->write_dma,
1497 		       0, "DMA Write speed in MB/s");
1498 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 		       "read_write_dma_MBs",
1500 		       CTLFLAG_RD, &sc->read_write_dma,
1501 		       0, "DMA concurrent Read/Write speed in MB/s");
1502 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503 		       "watchdog_resets",
1504 		       CTLFLAG_RD, &sc->watchdog_resets,
1505 		       0, "Number of times NIC was reset");
1506 
1507 
1508 	/* performance related tunables */
1509 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1510 	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1511 	    sc, 0, mxge_change_intr_coal, "I",
1512 	    "interrupt coalescing delay in usecs");
1513 
1514 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1515 	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1516 	    mxge_change_throttle, "I", "transmit throttling");
1517 
1518 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519 	    "flow_control_enabled",
1520 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1521 	    mxge_change_flow_control, "I",
1522 	    "interrupt coalescing delay in usecs");
1523 
1524 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1525 		       "deassert_wait",
1526 		       CTLFLAG_RW, &mxge_deassert_wait,
1527 		       0, "Wait for IRQ line to go low in ihandler");
1528 
1529 	/* stats block from firmware is in network byte order.
1530 	   Need to swap it */
1531 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1532 	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1533 	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1534 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1536 	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1537 	    "rdma_tags_available");
1538 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539 	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1540 	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1541 	    "dropped_bad_crc32");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1544 	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 	    "dropped_link_error_or_filtered",
1547 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1548 	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1549 	    "dropped_link_error_or_filtered");
1550 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 	    "dropped_link_overflow",
1552 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1553 	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1554 	    "dropped_link_overflow");
1555 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1556 	    "dropped_multicast_filtered",
1557 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1558 	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1559 	    "dropped_multicast_filtered");
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 	    "dropped_no_big_buffer",
1562 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1563 	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1564 	    "dropped_no_big_buffer");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 	    "dropped_no_small_buffer",
1567 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1568 	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1569 	    "dropped_no_small_buffer");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 	    "dropped_overrun",
1572 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1573 	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1574 	    "dropped_overrun");
1575 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1577 	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1578 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1580 	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1581 
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 	    "dropped_unicast_filtered",
1584 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1585 	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1586 	    "dropped_unicast_filtered");
1587 
1588 	/* verbose printing? */
1589 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1590 		       "verbose",
1591 		       CTLFLAG_RW, &mxge_verbose,
1592 		       0, "verbose printing");
1593 
1594 	/* add counters exported for debugging from all slices */
1595 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1596 	sc->slice_sysctl_tree =
1597 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1598 		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1599 
1600 	for (slice = 0; slice < sc->num_slices; slice++) {
1601 		ss = &sc->ss[slice];
1602 		sysctl_ctx_init(&ss->sysctl_ctx);
1603 		ctx = &ss->sysctl_ctx;
1604 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1605 		sprintf(slice_num, "%d", slice);
1606 		ss->sysctl_tree =
1607 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1608 			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1609 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1610 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611 			       "rx_small_cnt",
1612 			       CTLFLAG_RD, &ss->rx_small.cnt,
1613 			       0, "rx_small_cnt");
1614 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1615 			       "rx_big_cnt",
1616 			       CTLFLAG_RD, &ss->rx_big.cnt,
1617 			       0, "rx_small_cnt");
1618 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1619 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1620 			       0, "number of lro merge queues flushed");
1621 
1622 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1623 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1624 			       0, "number of bad csums preventing LRO");
1625 
1626 		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1627 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1628 			       0, "number of frames appended to lro merge"
1629 			       "queues");
1630 
1631 #ifndef IFNET_BUF_RING
1632 		/* only transmit from slice 0 for now */
1633 		if (slice > 0)
1634 			continue;
1635 #endif
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "tx_req",
1638 			       CTLFLAG_RD, &ss->tx.req,
1639 			       0, "tx_req");
1640 
1641 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1642 			       "tx_done",
1643 			       CTLFLAG_RD, &ss->tx.done,
1644 			       0, "tx_done");
1645 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1646 			       "tx_pkt_done",
1647 			       CTLFLAG_RD, &ss->tx.pkt_done,
1648 			       0, "tx_done");
1649 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 			       "tx_stall",
1651 			       CTLFLAG_RD, &ss->tx.stall,
1652 			       0, "tx_stall");
1653 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1654 			       "tx_wake",
1655 			       CTLFLAG_RD, &ss->tx.wake,
1656 			       0, "tx_wake");
1657 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 			       "tx_defrag",
1659 			       CTLFLAG_RD, &ss->tx.defrag,
1660 			       0, "tx_defrag");
1661 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 			       "tx_queue_active",
1663 			       CTLFLAG_RD, &ss->tx.queue_active,
1664 			       0, "tx_queue_active");
1665 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 			       "tx_activate",
1667 			       CTLFLAG_RD, &ss->tx.activate,
1668 			       0, "tx_activate");
1669 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 			       "tx_deactivate",
1671 			       CTLFLAG_RD, &ss->tx.deactivate,
1672 			       0, "tx_deactivate");
1673 	}
1674 }
1675 
1676 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1677    backwards one at a time and handle ring wraps */
1678 
1679 static inline void
1680 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1681 			    mcp_kreq_ether_send_t *src, int cnt)
1682 {
1683 	int idx, starting_slot;
1684 	starting_slot = tx->req;
1685 	while (cnt > 1) {
1686 		cnt--;
1687 		idx = (starting_slot + cnt) & tx->mask;
1688 		mxge_pio_copy(&tx->lanai[idx],
1689 			      &src[cnt], sizeof(*src));
1690 		wmb();
1691 	}
1692 }
1693 
1694 /*
1695  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1696  * at most 32 bytes at a time, so as to avoid involving the software
1697  * pio handler in the nic.   We re-write the first segment's flags
1698  * to mark them valid only after writing the entire chain
1699  */
1700 
1701 static inline void
1702 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1703 		  int cnt)
1704 {
1705 	int idx, i;
1706 	uint32_t *src_ints;
1707 	volatile uint32_t *dst_ints;
1708 	mcp_kreq_ether_send_t *srcp;
1709 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1710 	uint8_t last_flags;
1711 
1712 	idx = tx->req & tx->mask;
1713 
1714 	last_flags = src->flags;
1715 	src->flags = 0;
1716 	wmb();
1717 	dst = dstp = &tx->lanai[idx];
1718 	srcp = src;
1719 
1720 	if ((idx + cnt) < tx->mask) {
1721 		for (i = 0; i < (cnt - 1); i += 2) {
1722 			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1723 			wmb(); /* force write every 32 bytes */
1724 			srcp += 2;
1725 			dstp += 2;
1726 		}
1727 	} else {
1728 		/* submit all but the first request, and ensure
1729 		   that it is submitted below */
1730 		mxge_submit_req_backwards(tx, src, cnt);
1731 		i = 0;
1732 	}
1733 	if (i < cnt) {
1734 		/* submit the first request */
1735 		mxge_pio_copy(dstp, srcp, sizeof(*src));
1736 		wmb(); /* barrier before setting valid flag */
1737 	}
1738 
1739 	/* re-write the last 32-bits with the valid flags */
1740 	src->flags = last_flags;
1741 	src_ints = (uint32_t *)src;
1742 	src_ints+=3;
1743 	dst_ints = (volatile uint32_t *)dst;
1744 	dst_ints+=3;
1745 	*dst_ints =  *src_ints;
1746 	tx->req += cnt;
1747 	wmb();
1748 }
1749 
1750 static int
1751 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1752     struct mxge_pkt_info *pi)
1753 {
1754 	struct ether_vlan_header *eh;
1755 	uint16_t etype;
1756 	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1757 #if IFCAP_TSO6 && defined(INET6)
1758 	int nxt;
1759 #endif
1760 
1761 	eh = mtod(m, struct ether_vlan_header *);
1762 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1763 		etype = ntohs(eh->evl_proto);
1764 		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1765 	} else {
1766 		etype = ntohs(eh->evl_encap_proto);
1767 		pi->ip_off = ETHER_HDR_LEN;
1768 	}
1769 
1770 	switch (etype) {
1771 	case ETHERTYPE_IP:
1772 		/*
1773 		 * ensure ip header is in first mbuf, copy it to a
1774 		 * scratch buffer if not
1775 		 */
1776 		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1777 		pi->ip6 = NULL;
1778 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1779 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1780 			    ss->scratch);
1781 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1782 		}
1783 		pi->ip_hlen = pi->ip->ip_hl << 2;
1784 		if (!tso)
1785 			return 0;
1786 
1787 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1788 		    sizeof(struct tcphdr))) {
1789 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1790 			    sizeof(struct tcphdr), ss->scratch);
1791 			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1792 		}
1793 		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1794 		break;
1795 #if IFCAP_TSO6 && defined(INET6)
1796 	case ETHERTYPE_IPV6:
1797 		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1798 		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1799 			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1800 			    ss->scratch);
1801 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1802 		}
1803 		nxt = 0;
1804 		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1805 		pi->ip_hlen -= pi->ip_off;
1806 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1807 			return EINVAL;
1808 
1809 		if (!tso)
1810 			return 0;
1811 
1812 		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1813 			return EINVAL;
1814 
1815 		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1816 		    sizeof(struct tcphdr))) {
1817 			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1818 			    sizeof(struct tcphdr), ss->scratch);
1819 			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1820 		}
1821 		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1822 		break;
1823 #endif
1824 	default:
1825 		return EINVAL;
1826 	}
1827 	return 0;
1828 }
1829 
1830 #if IFCAP_TSO4
1831 
/*
 * Build and submit the send-request chain for a TSO packet.
 *
 * ss              slice whose tx ring is used
 * m               packet being sent; freed on the drop path
 * busdma_seg_cnt  number of valid entries in tx->seg_list
 * pi              header layout (ip_off, ip_hlen, ip/ip6, tcp) for m
 *
 * Each busdma segment is broken into requests in tx->req_list, which is
 * then handed to mxge_submit_req().  If more than tx->max_desc requests
 * would be needed, the DMA map is unloaded, the mbuf is freed,
 * ss->oerrors is bumped and (once only) a diagnostic is printed.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;	/* limits the drop diagnostic to one print */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		/* NOTE(review): sum is only assigned inside the #if/#ifdef
		 * sections above */
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the request that started the
			 * current run of RDMAs */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: rdma_count becomes -1 when
				 * chop or next_is_first is set, then 0 when
				 * chop is set and next_is_first is not */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* for IPv4, count the checksum offset down by the
			 * bytes just consumed; the IPv6 value is left
			 * unchanged */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards tagging requests TSO_LAST, stopping after the
	 * first one that carries TSO_CHOP or FIRST */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the ring slot of the packet's last request */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	/* too many descriptors: release the mapping and the packet */
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
2015 
2016 #endif /* IFCAP_TSO4 */
2017 
2018 #ifdef MXGE_NEW_VLAN_API
2019 /*
2020  * We reproduce the software vlan tag insertion from
2021  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2022  * vlan tag insertion. We need to advertise this in order to have the
2023  * vlan interface respect our csum offload flags.
2024  */
2025 static struct mbuf *
2026 mxge_vlan_tag_insert(struct mbuf *m)
2027 {
2028 	struct ether_vlan_header *evl;
2029 
2030 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2031 	if (__predict_false(m == NULL))
2032 		return NULL;
2033 	if (m->m_len < sizeof(*evl)) {
2034 		m = m_pullup(m, sizeof(*evl));
2035 		if (__predict_false(m == NULL))
2036 			return NULL;
2037 	}
2038 	/*
2039 	 * Transform the Ethernet header into an Ethernet header
2040 	 * with 802.1Q encapsulation.
2041 	 */
2042 	evl = mtod(m, struct ether_vlan_header *);
2043 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2044 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2045 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2046 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2047 	m->m_flags &= ~M_VLANTAG;
2048 	return m;
2049 }
2050 #endif /* MXGE_NEW_VLAN_API */
2051 
2052 static void
2053 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2054 {
2055 	struct mxge_pkt_info pi = {0,0,0,0};
2056 	mxge_softc_t *sc;
2057 	mcp_kreq_ether_send_t *req;
2058 	bus_dma_segment_t *seg;
2059 	struct mbuf *m_tmp;
2060 	struct ifnet *ifp;
2061 	mxge_tx_ring_t *tx;
2062 	int cnt, cum_len, err, i, idx, odd_flag;
2063 	uint16_t pseudo_hdr_offset;
2064 	uint8_t flags, cksum_offset;
2065 
2066 
2067 	sc = ss->sc;
2068 	ifp = sc->ifp;
2069 	tx = &ss->tx;
2070 
2071 #ifdef MXGE_NEW_VLAN_API
2072 	if (m->m_flags & M_VLANTAG) {
2073 		m = mxge_vlan_tag_insert(m);
2074 		if (__predict_false(m == NULL))
2075 			goto drop_without_m;
2076 	}
2077 #endif
2078 	if (m->m_pkthdr.csum_flags &
2079 	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2080 		if (mxge_parse_tx(ss, m, &pi))
2081 			goto drop;
2082 	}
2083 
2084 	/* (try to) map the frame for DMA */
2085 	idx = tx->req & tx->mask;
2086 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2087 				      m, tx->seg_list, &cnt,
2088 				      BUS_DMA_NOWAIT);
2089 	if (__predict_false(err == EFBIG)) {
2090 		/* Too many segments in the chain.  Try
2091 		   to defrag */
2092 		m_tmp = m_defrag(m, M_NOWAIT);
2093 		if (m_tmp == NULL) {
2094 			goto drop;
2095 		}
2096 		ss->tx.defrag++;
2097 		m = m_tmp;
2098 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2099 					      tx->info[idx].map,
2100 					      m, tx->seg_list, &cnt,
2101 					      BUS_DMA_NOWAIT);
2102 	}
2103 	if (__predict_false(err != 0)) {
2104 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2105 			      " packet len = %d\n", err, m->m_pkthdr.len);
2106 		goto drop;
2107 	}
2108 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2109 			BUS_DMASYNC_PREWRITE);
2110 	tx->info[idx].m = m;
2111 
2112 #if IFCAP_TSO4
2113 	/* TSO is different enough, we handle it in another routine */
2114 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2115 		mxge_encap_tso(ss, m, cnt, &pi);
2116 		return;
2117 	}
2118 #endif
2119 
2120 	req = tx->req_list;
2121 	cksum_offset = 0;
2122 	pseudo_hdr_offset = 0;
2123 	flags = MXGEFW_FLAGS_NO_TSO;
2124 
2125 	/* checksum offloading? */
2126 	if (m->m_pkthdr.csum_flags &
2127 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2128 		/* ensure ip header is in first mbuf, copy
2129 		   it to a scratch buffer if not */
2130 		cksum_offset = pi.ip_off + pi.ip_hlen;
2131 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2132 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2133 		req->cksum_offset = cksum_offset;
2134 		flags |= MXGEFW_FLAGS_CKSUM;
2135 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2136 	} else {
2137 		odd_flag = 0;
2138 	}
2139 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2140 		flags |= MXGEFW_FLAGS_SMALL;
2141 
2142 	/* convert segments into a request list */
2143 	cum_len = 0;
2144 	seg = tx->seg_list;
2145 	req->flags = MXGEFW_FLAGS_FIRST;
2146 	for (i = 0; i < cnt; i++) {
2147 		req->addr_low =
2148 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2149 		req->addr_high =
2150 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2151 		req->length = htobe16(seg->ds_len);
2152 		req->cksum_offset = cksum_offset;
2153 		if (cksum_offset > seg->ds_len)
2154 			cksum_offset -= seg->ds_len;
2155 		else
2156 			cksum_offset = 0;
2157 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2158 		req->pad = 0; /* complete solid 16-byte block */
2159 		req->rdma_count = 1;
2160 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2161 		cum_len += seg->ds_len;
2162 		seg++;
2163 		req++;
2164 		req->flags = 0;
2165 	}
2166 	req--;
2167 	/* pad runts to 60 bytes */
2168 	if (cum_len < 60) {
2169 		req++;
2170 		req->addr_low =
2171 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2172 		req->addr_high =
2173 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2174 		req->length = htobe16(60 - cum_len);
2175 		req->cksum_offset = 0;
2176 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2177 		req->pad = 0; /* complete solid 16-byte block */
2178 		req->rdma_count = 1;
2179 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2180 		cnt++;
2181 	}
2182 
2183 	tx->req_list[0].rdma_count = cnt;
2184 #if 0
2185 	/* print what the firmware will see */
2186 	for (i = 0; i < cnt; i++) {
2187 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2188 		    "cso:%d, flags:0x%x, rdma:%d\n",
2189 		    i, (int)ntohl(tx->req_list[i].addr_high),
2190 		    (int)ntohl(tx->req_list[i].addr_low),
2191 		    (int)ntohs(tx->req_list[i].length),
2192 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2193 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2194 		    tx->req_list[i].rdma_count);
2195 	}
2196 	printf("--------------\n");
2197 #endif
2198 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2199 	mxge_submit_req(tx, tx->req_list, cnt);
2200 #ifdef IFNET_BUF_RING
2201 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2202 		/* tell the NIC to start polling this slice */
2203 		*tx->send_go = 1;
2204 		tx->queue_active = 1;
2205 		tx->activate++;
2206 		wmb();
2207 	}
2208 #endif
2209 	return;
2210 
2211 drop:
2212 	m_freem(m);
2213 drop_without_m:
2214 	ss->oerrors++;
2215 	return;
2216 }
2217 
2218 #ifdef IFNET_BUF_RING
2219 static void
2220 mxge_qflush(struct ifnet *ifp)
2221 {
2222 	mxge_softc_t *sc = ifp->if_softc;
2223 	mxge_tx_ring_t *tx;
2224 	struct mbuf *m;
2225 	int slice;
2226 
2227 	for (slice = 0; slice < sc->num_slices; slice++) {
2228 		tx = &sc->ss[slice].tx;
2229 		mtx_lock(&tx->mtx);
2230 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2231 			m_freem(m);
2232 		mtx_unlock(&tx->mtx);
2233 	}
2234 	if_qflush(ifp);
2235 }
2236 
2237 static inline void
2238 mxge_start_locked(struct mxge_slice_state *ss)
2239 {
2240 	mxge_softc_t *sc;
2241 	struct mbuf *m;
2242 	struct ifnet *ifp;
2243 	mxge_tx_ring_t *tx;
2244 
2245 	sc = ss->sc;
2246 	ifp = sc->ifp;
2247 	tx = &ss->tx;
2248 
2249 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2250 		m = drbr_dequeue(ifp, tx->br);
2251 		if (m == NULL) {
2252 			return;
2253 		}
2254 		/* let BPF see it */
2255 		BPF_MTAP(ifp, m);
2256 
2257 		/* give it to the nic */
2258 		mxge_encap(ss, m);
2259 	}
2260 	/* ran out of transmit slots */
2261 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2262 	    && (!drbr_empty(ifp, tx->br))) {
2263 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2264 		tx->stall++;
2265 	}
2266 }
2267 
2268 static int
2269 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2270 {
2271 	mxge_softc_t *sc;
2272 	struct ifnet *ifp;
2273 	mxge_tx_ring_t *tx;
2274 	int err;
2275 
2276 	sc = ss->sc;
2277 	ifp = sc->ifp;
2278 	tx = &ss->tx;
2279 
2280 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2281 	    IFF_DRV_RUNNING) {
2282 		err = drbr_enqueue(ifp, tx->br, m);
2283 		return (err);
2284 	}
2285 
2286 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2287 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2288 		/* let BPF see it */
2289 		BPF_MTAP(ifp, m);
2290 		/* give it to the nic */
2291 		mxge_encap(ss, m);
2292 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2293 		return (err);
2294 	}
2295 	if (!drbr_empty(ifp, tx->br))
2296 		mxge_start_locked(ss);
2297 	return (0);
2298 }
2299 
2300 static int
2301 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2302 {
2303 	mxge_softc_t *sc = ifp->if_softc;
2304 	struct mxge_slice_state *ss;
2305 	mxge_tx_ring_t *tx;
2306 	int err = 0;
2307 	int slice;
2308 
2309 	slice = m->m_pkthdr.flowid;
2310 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2311 
2312 	ss = &sc->ss[slice];
2313 	tx = &ss->tx;
2314 
2315 	if (mtx_trylock(&tx->mtx)) {
2316 		err = mxge_transmit_locked(ss, m);
2317 		mtx_unlock(&tx->mtx);
2318 	} else {
2319 		err = drbr_enqueue(ifp, tx->br, m);
2320 	}
2321 
2322 	return (err);
2323 }
2324 
2325 #else
2326 
2327 static inline void
2328 mxge_start_locked(struct mxge_slice_state *ss)
2329 {
2330 	mxge_softc_t *sc;
2331 	struct mbuf *m;
2332 	struct ifnet *ifp;
2333 	mxge_tx_ring_t *tx;
2334 
2335 	sc = ss->sc;
2336 	ifp = sc->ifp;
2337 	tx = &ss->tx;
2338 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2339 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2340 		if (m == NULL) {
2341 			return;
2342 		}
2343 		/* let BPF see it */
2344 		BPF_MTAP(ifp, m);
2345 
2346 		/* give it to the nic */
2347 		mxge_encap(ss, m);
2348 	}
2349 	/* ran out of transmit slots */
2350 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2351 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2352 		tx->stall++;
2353 	}
2354 }
2355 #endif
2356 static void
2357 mxge_start(struct ifnet *ifp)
2358 {
2359 	mxge_softc_t *sc = ifp->if_softc;
2360 	struct mxge_slice_state *ss;
2361 
2362 	/* only use the first slice for now */
2363 	ss = &sc->ss[0];
2364 	mtx_lock(&ss->tx.mtx);
2365 	mxge_start_locked(ss);
2366 	mtx_unlock(&ss->tx.mtx);
2367 }
2368 
2369 /*
2370  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2371  * at most 32 bytes at a time, so as to avoid involving the software
2372  * pio handler in the nic.   We re-write the first segment's low
2373  * DMA address to mark it valid only after we write the entire chunk
2374  * in a burst
2375  */
2376 static inline void
2377 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2378 		mcp_kreq_ether_recv_t *src)
2379 {
2380 	uint32_t low;
2381 
2382 	low = src->addr_low;
2383 	src->addr_low = 0xffffffff;
2384 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2385 	wmb();
2386 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2387 	wmb();
2388 	src->addr_low = low;
2389 	dst->addr_low = low;
2390 	wmb();
2391 }
2392 
2393 static int
2394 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2395 {
2396 	bus_dma_segment_t seg;
2397 	struct mbuf *m;
2398 	mxge_rx_ring_t *rx = &ss->rx_small;
2399 	int cnt, err;
2400 
2401 	m = m_gethdr(M_NOWAIT, MT_DATA);
2402 	if (m == NULL) {
2403 		rx->alloc_fail++;
2404 		err = ENOBUFS;
2405 		goto done;
2406 	}
2407 	m->m_len = MHLEN;
2408 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2409 				      &seg, &cnt, BUS_DMA_NOWAIT);
2410 	if (err != 0) {
2411 		m_free(m);
2412 		goto done;
2413 	}
2414 	rx->info[idx].m = m;
2415 	rx->shadow[idx].addr_low =
2416 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2417 	rx->shadow[idx].addr_high =
2418 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2419 
2420 done:
2421 	if ((idx & 7) == 7)
2422 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2423 	return err;
2424 }
2425 
2426 static int
2427 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2428 {
2429 	bus_dma_segment_t seg[3];
2430 	struct mbuf *m;
2431 	mxge_rx_ring_t *rx = &ss->rx_big;
2432 	int cnt, err, i;
2433 
2434 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2435 	if (m == NULL) {
2436 		rx->alloc_fail++;
2437 		err = ENOBUFS;
2438 		goto done;
2439 	}
2440 	m->m_len = rx->mlen;
2441 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2442 				      seg, &cnt, BUS_DMA_NOWAIT);
2443 	if (err != 0) {
2444 		m_free(m);
2445 		goto done;
2446 	}
2447 	rx->info[idx].m = m;
2448 	rx->shadow[idx].addr_low =
2449 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2450 	rx->shadow[idx].addr_high =
2451 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2452 
2453 #if MXGE_VIRT_JUMBOS
2454 	for (i = 1; i < cnt; i++) {
2455 		rx->shadow[idx + i].addr_low =
2456 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2457 		rx->shadow[idx + i].addr_high =
2458 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2459        }
2460 #endif
2461 
2462 done:
2463        for (i = 0; i < rx->nbufs; i++) {
2464 		if ((idx & 7) == 7) {
2465 			mxge_submit_8rx(&rx->lanai[idx - 7],
2466 					&rx->shadow[idx - 7]);
2467 		}
2468 		idx++;
2469 	}
2470 	return err;
2471 }
2472 
2473 #ifdef INET6
2474 
/*
 * Compute the folded 16-bit ones-complement sum of len bytes at raw.
 *
 * The data is summed as 16-bit words in memory order, with no byte
 * swapping.  A trailing odd byte is treated as the first byte of a word
 * whose second byte is zero, so no byte beyond raw[len - 1] is read.
 * A len of zero or less yields 0.
 *
 * The result is the sum itself, not its complement.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t csum = 0;

	/* whole 16-bit words */
	for (; len > 1; len -= 2)
		csum += *raw++;

	/* odd trailing byte: pad with zero rather than over-reading */
	if (len == 1) {
		uint16_t tail = 0;

		*(uint8_t *)&tail = *(const uint8_t *)raw;
		csum += tail;
	}

	/* fold the carries back in; twice covers a carry from the fold */
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
2491 
2492 static inline uint16_t
2493 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2494 {
2495 	uint32_t partial;
2496 	int nxt, cksum_offset;
2497 	struct ip6_hdr *ip6 = p;
2498 	uint16_t c;
2499 
2500 	nxt = ip6->ip6_nxt;
2501 	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2502 	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2503 		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2504 					   IPPROTO_IPV6, &nxt);
2505 		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2506 			return (1);
2507 	}
2508 
2509 	/*
2510 	 * IPv6 headers do not contain a checksum, and hence
2511 	 * do not checksum to zero, so they don't "fall out"
2512 	 * of the partial checksum calculation like IPv4
2513 	 * headers do.  We need to fix the partial checksum by
2514 	 * subtracting the checksum of the IPv6 header.
2515 	 */
2516 
2517 	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2518 				    ETHER_HDR_LEN);
2519 	csum += ~partial;
2520 	csum +=	 (csum < ~partial);
2521 	csum = (csum >> 16) + (csum & 0xFFFF);
2522 	csum = (csum >> 16) + (csum & 0xFFFF);
2523 	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2524 			     csum);
2525 	c ^= 0xffff;
2526 	return (c);
2527 }
2528 #endif /* INET6 */
2529 /*
2530  *  Myri10GE hardware checksums are not valid if the sender
2531  *  padded the frame with non-zero padding.  This is because
2532  *  the firmware just does a simple 16-bit 1s complement
2533  *  checksum across the entire frame, excluding the first 14
2534  *  bytes.  It is best to simply to check the checksum and
2535  *  tell the stack about it only if the checksum is good
2536  */
2537 
2538 static inline uint16_t
2539 mxge_rx_csum(struct mbuf *m, int csum)
2540 {
2541 	struct ether_header *eh;
2542 #ifdef INET
2543 	struct ip *ip;
2544 #endif
2545 #if defined(INET) || defined(INET6)
2546 	int cap = m->m_pkthdr.rcvif->if_capenable;
2547 #endif
2548 	uint16_t c, etype;
2549 
2550 
2551 	eh = mtod(m, struct ether_header *);
2552 	etype = ntohs(eh->ether_type);
2553 	switch (etype) {
2554 #ifdef INET
2555 	case ETHERTYPE_IP:
2556 		if ((cap & IFCAP_RXCSUM) == 0)
2557 			return (1);
2558 		ip = (struct ip *)(eh + 1);
2559 		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2560 			return (1);
2561 		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2562 			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2563 				    (ip->ip_hl << 2) + ip->ip_p));
2564 		c ^= 0xffff;
2565 		break;
2566 #endif
2567 #ifdef INET6
2568 	case ETHERTYPE_IPV6:
2569 		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2570 			return (1);
2571 		c = mxge_rx_csum6((eh + 1), m, csum);
2572 		break;
2573 #endif
2574 	default:
2575 		c = 1;
2576 	}
2577 	return (c);
2578 }
2579 
2580 static void
2581 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2582 {
2583 	struct ether_vlan_header *evl;
2584 	struct ether_header *eh;
2585 	uint32_t partial;
2586 
2587 	evl = mtod(m, struct ether_vlan_header *);
2588 	eh = mtod(m, struct ether_header *);
2589 
2590 	/*
2591 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2592 	 * after what the firmware thought was the end of the ethernet
2593 	 * header.
2594 	 */
2595 
2596 	/* put checksum into host byte order */
2597 	*csum = ntohs(*csum);
2598 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2599 	(*csum) += ~partial;
2600 	(*csum) +=  ((*csum) < ~partial);
2601 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2602 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2603 
2604 	/* restore checksum to network byte order;
2605 	   later consumers expect this */
2606 	*csum = htons(*csum);
2607 
2608 	/* save the tag */
2609 #ifdef MXGE_NEW_VLAN_API
2610 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2611 #else
2612 	{
2613 		struct m_tag *mtag;
2614 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2615 				   M_NOWAIT);
2616 		if (mtag == NULL)
2617 			return;
2618 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2619 		m_tag_prepend(m, mtag);
2620 	}
2621 
2622 #endif
2623 	m->m_flags |= M_VLANTAG;
2624 
2625 	/*
2626 	 * Remove the 802.1q header by copying the Ethernet
2627 	 * addresses over it and adjusting the beginning of
2628 	 * the data in the mbuf.  The encapsulated Ethernet
2629 	 * type field is already in place.
2630 	 */
2631 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2632 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2633 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2634 }
2635 
2636 
2637 static inline void
2638 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2639 		 uint32_t csum, int lro)
2640 {
2641 	mxge_softc_t *sc;
2642 	struct ifnet *ifp;
2643 	struct mbuf *m;
2644 	struct ether_header *eh;
2645 	mxge_rx_ring_t *rx;
2646 	bus_dmamap_t old_map;
2647 	int idx;
2648 
2649 	sc = ss->sc;
2650 	ifp = sc->ifp;
2651 	rx = &ss->rx_big;
2652 	idx = rx->cnt & rx->mask;
2653 	rx->cnt += rx->nbufs;
2654 	/* save a pointer to the received mbuf */
2655 	m = rx->info[idx].m;
2656 	/* try to replace the received mbuf */
2657 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2658 		/* drop the frame -- the old mbuf is re-cycled */
2659 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2660 		return;
2661 	}
2662 
2663 	/* unmap the received buffer */
2664 	old_map = rx->info[idx].map;
2665 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2666 	bus_dmamap_unload(rx->dmat, old_map);
2667 
2668 	/* swap the bus_dmamap_t's */
2669 	rx->info[idx].map = rx->extra_map;
2670 	rx->extra_map = old_map;
2671 
2672 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2673 	 * aligned */
2674 	m->m_data += MXGEFW_PAD;
2675 
2676 	m->m_pkthdr.rcvif = ifp;
2677 	m->m_len = m->m_pkthdr.len = len;
2678 	ss->ipackets++;
2679 	eh = mtod(m, struct ether_header *);
2680 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2681 		mxge_vlan_tag_remove(m, &csum);
2682 	}
2683 	/* flowid only valid if RSS hashing is enabled */
2684 	if (sc->num_slices > 1) {
2685 		m->m_pkthdr.flowid = (ss - sc->ss);
2686 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2687 	}
2688 	/* if the checksum is valid, mark it in the mbuf header */
2689 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2690 	    (0 == mxge_rx_csum(m, csum))) {
2691 		/* Tell the stack that the  checksum is good */
2692 		m->m_pkthdr.csum_data = 0xffff;
2693 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2694 			CSUM_DATA_VALID;
2695 
2696 #if defined(INET) || defined (INET6)
2697 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2698 			return;
2699 #endif
2700 	}
2701 	/* pass the frame up the stack */
2702 	(*ifp->if_input)(ifp, m);
2703 }
2704 
2705 static inline void
2706 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2707 		   uint32_t csum, int lro)
2708 {
2709 	mxge_softc_t *sc;
2710 	struct ifnet *ifp;
2711 	struct ether_header *eh;
2712 	struct mbuf *m;
2713 	mxge_rx_ring_t *rx;
2714 	bus_dmamap_t old_map;
2715 	int idx;
2716 
2717 	sc = ss->sc;
2718 	ifp = sc->ifp;
2719 	rx = &ss->rx_small;
2720 	idx = rx->cnt & rx->mask;
2721 	rx->cnt++;
2722 	/* save a pointer to the received mbuf */
2723 	m = rx->info[idx].m;
2724 	/* try to replace the received mbuf */
2725 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2726 		/* drop the frame -- the old mbuf is re-cycled */
2727 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2728 		return;
2729 	}
2730 
2731 	/* unmap the received buffer */
2732 	old_map = rx->info[idx].map;
2733 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2734 	bus_dmamap_unload(rx->dmat, old_map);
2735 
2736 	/* swap the bus_dmamap_t's */
2737 	rx->info[idx].map = rx->extra_map;
2738 	rx->extra_map = old_map;
2739 
2740 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2741 	 * aligned */
2742 	m->m_data += MXGEFW_PAD;
2743 
2744 	m->m_pkthdr.rcvif = ifp;
2745 	m->m_len = m->m_pkthdr.len = len;
2746 	ss->ipackets++;
2747 	eh = mtod(m, struct ether_header *);
2748 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2749 		mxge_vlan_tag_remove(m, &csum);
2750 	}
2751 	/* flowid only valid if RSS hashing is enabled */
2752 	if (sc->num_slices > 1) {
2753 		m->m_pkthdr.flowid = (ss - sc->ss);
2754 		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2755 	}
2756 	/* if the checksum is valid, mark it in the mbuf header */
2757 	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2758 	    (0 == mxge_rx_csum(m, csum))) {
2759 		/* Tell the stack that the  checksum is good */
2760 		m->m_pkthdr.csum_data = 0xffff;
2761 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2762 			CSUM_DATA_VALID;
2763 
2764 #if defined(INET) || defined (INET6)
2765 		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2766 			return;
2767 #endif
2768 	}
2769 	/* pass the frame up the stack */
2770 	(*ifp->if_input)(ifp, m);
2771 }
2772 
2773 static inline void
2774 mxge_clean_rx_done(struct mxge_slice_state *ss)
2775 {
2776 	mxge_rx_done_t *rx_done = &ss->rx_done;
2777 	int limit = 0;
2778 	uint16_t length;
2779 	uint16_t checksum;
2780 	int lro;
2781 
2782 	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2783 	while (rx_done->entry[rx_done->idx].length != 0) {
2784 		length = ntohs(rx_done->entry[rx_done->idx].length);
2785 		rx_done->entry[rx_done->idx].length = 0;
2786 		checksum = rx_done->entry[rx_done->idx].checksum;
2787 		if (length <= (MHLEN - MXGEFW_PAD))
2788 			mxge_rx_done_small(ss, length, checksum, lro);
2789 		else
2790 			mxge_rx_done_big(ss, length, checksum, lro);
2791 		rx_done->cnt++;
2792 		rx_done->idx = rx_done->cnt & rx_done->mask;
2793 
2794 		/* limit potential for livelock */
2795 		if (__predict_false(++limit > rx_done->mask / 2))
2796 			break;
2797 	}
2798 #if defined(INET)  || defined (INET6)
2799 	tcp_lro_flush_all(&ss->lc);
2800 #endif
2801 }
2802 
2803 
2804 static inline void
2805 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2806 {
2807 	struct ifnet *ifp;
2808 	mxge_tx_ring_t *tx;
2809 	struct mbuf *m;
2810 	bus_dmamap_t map;
2811 	int idx;
2812 	int *flags;
2813 
2814 	tx = &ss->tx;
2815 	ifp = ss->sc->ifp;
2816 	while (tx->pkt_done != mcp_idx) {
2817 		idx = tx->done & tx->mask;
2818 		tx->done++;
2819 		m = tx->info[idx].m;
2820 		/* mbuf and DMA map only attached to the first
2821 		   segment per-mbuf */
2822 		if (m != NULL) {
2823 			ss->obytes += m->m_pkthdr.len;
2824 			if (m->m_flags & M_MCAST)
2825 				ss->omcasts++;
2826 			ss->opackets++;
2827 			tx->info[idx].m = NULL;
2828 			map = tx->info[idx].map;
2829 			bus_dmamap_unload(tx->dmat, map);
2830 			m_freem(m);
2831 		}
2832 		if (tx->info[idx].flag) {
2833 			tx->info[idx].flag = 0;
2834 			tx->pkt_done++;
2835 		}
2836 	}
2837 
2838 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2839 	   its OK to send packets */
2840 #ifdef IFNET_BUF_RING
2841 	flags = &ss->if_drv_flags;
2842 #else
2843 	flags = &ifp->if_drv_flags;
2844 #endif
2845 	mtx_lock(&ss->tx.mtx);
2846 	if ((*flags) & IFF_DRV_OACTIVE &&
2847 	    tx->req - tx->done < (tx->mask + 1)/4) {
2848 		*(flags) &= ~IFF_DRV_OACTIVE;
2849 		ss->tx.wake++;
2850 		mxge_start_locked(ss);
2851 	}
2852 #ifdef IFNET_BUF_RING
2853 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2854 		/* let the NIC stop polling this queue, since there
2855 		 * are no more transmits pending */
2856 		if (tx->req == tx->done) {
2857 			*tx->send_stop = 1;
2858 			tx->queue_active = 0;
2859 			tx->deactivate++;
2860 			wmb();
2861 		}
2862 	}
2863 #endif
2864 	mtx_unlock(&ss->tx.mtx);
2865 
2866 }
2867 
2868 static struct mxge_media_type mxge_xfp_media_types[] =
2869 {
2870 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2871 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2872 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2873 	{0,		(1 << 5),	"10GBASE-ER"},
2874 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2875 	{0,		(1 << 3),	"10GBASE-SW"},
2876 	{0,		(1 << 2),	"10GBASE-LW"},
2877 	{0,		(1 << 1),	"10GBASE-EW"},
2878 	{0,		(1 << 0),	"Reserved"}
2879 };
2880 static struct mxge_media_type mxge_sfp_media_types[] =
2881 {
2882 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2883 	{0,		(1 << 7),	"Reserved"},
2884 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2885 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2886 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2887 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2888 };
2889 
2890 static void
2891 mxge_media_set(mxge_softc_t *sc, int media_type)
2892 {
2893 
2894 
2895 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2896 		    0, NULL);
2897 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2898 	sc->current_media = media_type;
2899 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2900 }
2901 
2902 static void
2903 mxge_media_init(mxge_softc_t *sc)
2904 {
2905 	char *ptr;
2906 	int i;
2907 
2908 	ifmedia_removeall(&sc->media);
2909 	mxge_media_set(sc, IFM_AUTO);
2910 
2911 	/*
2912 	 * parse the product code to deterimine the interface type
2913 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2914 	 * after the 3rd dash in the driver's cached copy of the
2915 	 * EEPROM's product code string.
2916 	 */
2917 	ptr = sc->product_code_string;
2918 	if (ptr == NULL) {
2919 		device_printf(sc->dev, "Missing product code\n");
2920 		return;
2921 	}
2922 
2923 	for (i = 0; i < 3; i++, ptr++) {
2924 		ptr = strchr(ptr, '-');
2925 		if (ptr == NULL) {
2926 			device_printf(sc->dev,
2927 				      "only %d dashes in PC?!?\n", i);
2928 			return;
2929 		}
2930 	}
2931 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2932 		/* -C is CX4 */
2933 		sc->connector = MXGE_CX4;
2934 		mxge_media_set(sc, IFM_10G_CX4);
2935 	} else if (*ptr == 'Q') {
2936 		/* -Q is Quad Ribbon Fiber */
2937 		sc->connector = MXGE_QRF;
2938 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2939 		/* FreeBSD has no media type for Quad ribbon fiber */
2940 	} else if (*ptr == 'R') {
2941 		/* -R is XFP */
2942 		sc->connector = MXGE_XFP;
2943 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2944 		/* -S or -2S is SFP+ */
2945 		sc->connector = MXGE_SFP;
2946 	} else {
2947 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2948 	}
2949 }
2950 
2951 /*
2952  * Determine the media type for a NIC.  Some XFPs will identify
2953  * themselves only when their link is up, so this is initiated via a
2954  * link up interrupt.  However, this can potentially take up to
2955  * several milliseconds, so it is run via the watchdog routine, rather
2956  * than in the interrupt handler itself.
2957  */
2958 static void
2959 mxge_media_probe(mxge_softc_t *sc)
2960 {
2961 	mxge_cmd_t cmd;
2962 	char *cage_type;
2963 
2964 	struct mxge_media_type *mxge_media_types = NULL;
2965 	int i, err, ms, mxge_media_type_entries;
2966 	uint32_t byte;
2967 
2968 	sc->need_media_probe = 0;
2969 
2970 	if (sc->connector == MXGE_XFP) {
2971 		/* -R is XFP */
2972 		mxge_media_types = mxge_xfp_media_types;
2973 		mxge_media_type_entries =
2974 			nitems(mxge_xfp_media_types);
2975 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2976 		cage_type = "XFP";
2977 	} else 	if (sc->connector == MXGE_SFP) {
2978 		/* -S or -2S is SFP+ */
2979 		mxge_media_types = mxge_sfp_media_types;
2980 		mxge_media_type_entries =
2981 			nitems(mxge_sfp_media_types);
2982 		cage_type = "SFP+";
2983 		byte = 3;
2984 	} else {
2985 		/* nothing to do; media type cannot change */
2986 		return;
2987 	}
2988 
2989 	/*
2990 	 * At this point we know the NIC has an XFP cage, so now we
2991 	 * try to determine what is in the cage by using the
2992 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2993 	 * register.  We read just one byte, which may take over
2994 	 * a millisecond
2995 	 */
2996 
2997 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2998 	cmd.data1 = byte;
2999 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3000 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3001 		device_printf(sc->dev, "failed to read XFP\n");
3002 	}
3003 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3004 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3005 	}
3006 	if (err != MXGEFW_CMD_OK) {
3007 		return;
3008 	}
3009 
3010 	/* now we wait for the data to be cached */
3011 	cmd.data0 = byte;
3012 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3013 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3014 		DELAY(1000);
3015 		cmd.data0 = byte;
3016 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3017 	}
3018 	if (err != MXGEFW_CMD_OK) {
3019 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3020 			      cage_type, err, ms);
3021 		return;
3022 	}
3023 
3024 	if (cmd.data0 == mxge_media_types[0].bitmask) {
3025 		if (mxge_verbose)
3026 			device_printf(sc->dev, "%s:%s\n", cage_type,
3027 				      mxge_media_types[0].name);
3028 		if (sc->current_media != mxge_media_types[0].flag) {
3029 			mxge_media_init(sc);
3030 			mxge_media_set(sc, mxge_media_types[0].flag);
3031 		}
3032 		return;
3033 	}
3034 	for (i = 1; i < mxge_media_type_entries; i++) {
3035 		if (cmd.data0 & mxge_media_types[i].bitmask) {
3036 			if (mxge_verbose)
3037 				device_printf(sc->dev, "%s:%s\n",
3038 					      cage_type,
3039 					      mxge_media_types[i].name);
3040 
3041 			if (sc->current_media != mxge_media_types[i].flag) {
3042 				mxge_media_init(sc);
3043 				mxge_media_set(sc, mxge_media_types[i].flag);
3044 			}
3045 			return;
3046 		}
3047 	}
3048 	if (mxge_verbose)
3049 		device_printf(sc->dev, "%s media 0x%x unknown\n",
3050 			      cage_type, cmd.data0);
3051 
3052 	return;
3053 }
3054 
3055 static void
3056 mxge_intr(void *arg)
3057 {
3058 	struct mxge_slice_state *ss = arg;
3059 	mxge_softc_t *sc = ss->sc;
3060 	mcp_irq_data_t *stats = ss->fw_stats;
3061 	mxge_tx_ring_t *tx = &ss->tx;
3062 	mxge_rx_done_t *rx_done = &ss->rx_done;
3063 	uint32_t send_done_count;
3064 	uint8_t valid;
3065 
3066 
3067 #ifndef IFNET_BUF_RING
3068 	/* an interrupt on a non-zero slice is implicitly valid
3069 	   since MSI-X irqs are not shared */
3070 	if (ss != sc->ss) {
3071 		mxge_clean_rx_done(ss);
3072 		*ss->irq_claim = be32toh(3);
3073 		return;
3074 	}
3075 #endif
3076 
3077 	/* make sure the DMA has finished */
3078 	if (!stats->valid) {
3079 		return;
3080 	}
3081 	valid = stats->valid;
3082 
3083 	if (sc->legacy_irq) {
3084 		/* lower legacy IRQ  */
3085 		*sc->irq_deassert = 0;
3086 		if (!mxge_deassert_wait)
3087 			/* don't wait for conf. that irq is low */
3088 			stats->valid = 0;
3089 	} else {
3090 		stats->valid = 0;
3091 	}
3092 
3093 	/* loop while waiting for legacy irq deassertion */
3094 	do {
3095 		/* check for transmit completes and receives */
3096 		send_done_count = be32toh(stats->send_done_count);
3097 		while ((send_done_count != tx->pkt_done) ||
3098 		       (rx_done->entry[rx_done->idx].length != 0)) {
3099 			if (send_done_count != tx->pkt_done)
3100 				mxge_tx_done(ss, (int)send_done_count);
3101 			mxge_clean_rx_done(ss);
3102 			send_done_count = be32toh(stats->send_done_count);
3103 		}
3104 		if (sc->legacy_irq && mxge_deassert_wait)
3105 			wmb();
3106 	} while (*((volatile uint8_t *) &stats->valid));
3107 
3108 	/* fw link & error stats meaningful only on the first slice */
3109 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3110 		if (sc->link_state != stats->link_up) {
3111 			sc->link_state = stats->link_up;
3112 			if (sc->link_state) {
3113 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3114 				if (mxge_verbose)
3115 					device_printf(sc->dev, "link up\n");
3116 			} else {
3117 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3118 				if (mxge_verbose)
3119 					device_printf(sc->dev, "link down\n");
3120 			}
3121 			sc->need_media_probe = 1;
3122 		}
3123 		if (sc->rdma_tags_available !=
3124 		    be32toh(stats->rdma_tags_available)) {
3125 			sc->rdma_tags_available =
3126 				be32toh(stats->rdma_tags_available);
3127 			device_printf(sc->dev, "RDMA timed out! %d tags "
3128 				      "left\n", sc->rdma_tags_available);
3129 		}
3130 
3131 		if (stats->link_down) {
3132 			sc->down_cnt += stats->link_down;
3133 			sc->link_state = 0;
3134 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3135 		}
3136 	}
3137 
3138 	/* check to see if we have rx token to pass back */
3139 	if (valid & 0x1)
3140 	    *ss->irq_claim = be32toh(3);
3141 	*(ss->irq_claim + 1) = be32toh(3);
3142 }
3143 
3144 static void
3145 mxge_init(void *arg)
3146 {
3147 	mxge_softc_t *sc = arg;
3148 	struct ifnet *ifp = sc->ifp;
3149 
3150 
3151 	mtx_lock(&sc->driver_mtx);
3152 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3153 		(void) mxge_open(sc);
3154 	mtx_unlock(&sc->driver_mtx);
3155 }
3156 
3157 
3158 
3159 static void
3160 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3161 {
3162 	int i;
3163 
3164 #if defined(INET) || defined(INET6)
3165 	tcp_lro_free(&ss->lc);
3166 #endif
3167 	for (i = 0; i <= ss->rx_big.mask; i++) {
3168 		if (ss->rx_big.info[i].m == NULL)
3169 			continue;
3170 		bus_dmamap_unload(ss->rx_big.dmat,
3171 				  ss->rx_big.info[i].map);
3172 		m_freem(ss->rx_big.info[i].m);
3173 		ss->rx_big.info[i].m = NULL;
3174 	}
3175 
3176 	for (i = 0; i <= ss->rx_small.mask; i++) {
3177 		if (ss->rx_small.info[i].m == NULL)
3178 			continue;
3179 		bus_dmamap_unload(ss->rx_small.dmat,
3180 				  ss->rx_small.info[i].map);
3181 		m_freem(ss->rx_small.info[i].m);
3182 		ss->rx_small.info[i].m = NULL;
3183 	}
3184 
3185 	/* transmit ring used only on the first slice */
3186 	if (ss->tx.info == NULL)
3187 		return;
3188 
3189 	for (i = 0; i <= ss->tx.mask; i++) {
3190 		ss->tx.info[i].flag = 0;
3191 		if (ss->tx.info[i].m == NULL)
3192 			continue;
3193 		bus_dmamap_unload(ss->tx.dmat,
3194 				  ss->tx.info[i].map);
3195 		m_freem(ss->tx.info[i].m);
3196 		ss->tx.info[i].m = NULL;
3197 	}
3198 }
3199 
3200 static void
3201 mxge_free_mbufs(mxge_softc_t *sc)
3202 {
3203 	int slice;
3204 
3205 	for (slice = 0; slice < sc->num_slices; slice++)
3206 		mxge_free_slice_mbufs(&sc->ss[slice]);
3207 }
3208 
3209 static void
3210 mxge_free_slice_rings(struct mxge_slice_state *ss)
3211 {
3212 	int i;
3213 
3214 
3215 	if (ss->rx_done.entry != NULL)
3216 		mxge_dma_free(&ss->rx_done.dma);
3217 	ss->rx_done.entry = NULL;
3218 
3219 	if (ss->tx.req_bytes != NULL)
3220 		free(ss->tx.req_bytes, M_DEVBUF);
3221 	ss->tx.req_bytes = NULL;
3222 
3223 	if (ss->tx.seg_list != NULL)
3224 		free(ss->tx.seg_list, M_DEVBUF);
3225 	ss->tx.seg_list = NULL;
3226 
3227 	if (ss->rx_small.shadow != NULL)
3228 		free(ss->rx_small.shadow, M_DEVBUF);
3229 	ss->rx_small.shadow = NULL;
3230 
3231 	if (ss->rx_big.shadow != NULL)
3232 		free(ss->rx_big.shadow, M_DEVBUF);
3233 	ss->rx_big.shadow = NULL;
3234 
3235 	if (ss->tx.info != NULL) {
3236 		if (ss->tx.dmat != NULL) {
3237 			for (i = 0; i <= ss->tx.mask; i++) {
3238 				bus_dmamap_destroy(ss->tx.dmat,
3239 						   ss->tx.info[i].map);
3240 			}
3241 			bus_dma_tag_destroy(ss->tx.dmat);
3242 		}
3243 		free(ss->tx.info, M_DEVBUF);
3244 	}
3245 	ss->tx.info = NULL;
3246 
3247 	if (ss->rx_small.info != NULL) {
3248 		if (ss->rx_small.dmat != NULL) {
3249 			for (i = 0; i <= ss->rx_small.mask; i++) {
3250 				bus_dmamap_destroy(ss->rx_small.dmat,
3251 						   ss->rx_small.info[i].map);
3252 			}
3253 			bus_dmamap_destroy(ss->rx_small.dmat,
3254 					   ss->rx_small.extra_map);
3255 			bus_dma_tag_destroy(ss->rx_small.dmat);
3256 		}
3257 		free(ss->rx_small.info, M_DEVBUF);
3258 	}
3259 	ss->rx_small.info = NULL;
3260 
3261 	if (ss->rx_big.info != NULL) {
3262 		if (ss->rx_big.dmat != NULL) {
3263 			for (i = 0; i <= ss->rx_big.mask; i++) {
3264 				bus_dmamap_destroy(ss->rx_big.dmat,
3265 						   ss->rx_big.info[i].map);
3266 			}
3267 			bus_dmamap_destroy(ss->rx_big.dmat,
3268 					   ss->rx_big.extra_map);
3269 			bus_dma_tag_destroy(ss->rx_big.dmat);
3270 		}
3271 		free(ss->rx_big.info, M_DEVBUF);
3272 	}
3273 	ss->rx_big.info = NULL;
3274 }
3275 
3276 static void
3277 mxge_free_rings(mxge_softc_t *sc)
3278 {
3279 	int slice;
3280 
3281 	for (slice = 0; slice < sc->num_slices; slice++)
3282 		mxge_free_slice_rings(&sc->ss[slice]);
3283 }
3284 
3285 static int
3286 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3287 		       int tx_ring_entries)
3288 {
3289 	mxge_softc_t *sc = ss->sc;
3290 	size_t bytes;
3291 	int err, i;
3292 
3293 	/* allocate per-slice receive resources */
3294 
3295 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3296 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3297 
3298 	/* allocate the rx shadow rings */
3299 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3300 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3301 
3302 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3303 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3304 
3305 	/* allocate the rx host info rings */
3306 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3307 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3308 
3309 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3310 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3311 
3312 	/* allocate the rx busdma resources */
3313 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3314 				 1,			/* alignment */
3315 				 4096,			/* boundary */
3316 				 BUS_SPACE_MAXADDR,	/* low */
3317 				 BUS_SPACE_MAXADDR,	/* high */
3318 				 NULL, NULL,		/* filter */
3319 				 MHLEN,			/* maxsize */
3320 				 1,			/* num segs */
3321 				 MHLEN,			/* maxsegsize */
3322 				 BUS_DMA_ALLOCNOW,	/* flags */
3323 				 NULL, NULL,		/* lock */
3324 				 &ss->rx_small.dmat);	/* tag */
3325 	if (err != 0) {
3326 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3327 			      err);
3328 		return err;
3329 	}
3330 
3331 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3332 				 1,			/* alignment */
3333 #if MXGE_VIRT_JUMBOS
3334 				 4096,			/* boundary */
3335 #else
3336 				 0,			/* boundary */
3337 #endif
3338 				 BUS_SPACE_MAXADDR,	/* low */
3339 				 BUS_SPACE_MAXADDR,	/* high */
3340 				 NULL, NULL,		/* filter */
3341 				 3*4096,		/* maxsize */
3342 #if MXGE_VIRT_JUMBOS
3343 				 3,			/* num segs */
3344 				 4096,			/* maxsegsize*/
3345 #else
3346 				 1,			/* num segs */
3347 				 MJUM9BYTES,		/* maxsegsize*/
3348 #endif
3349 				 BUS_DMA_ALLOCNOW,	/* flags */
3350 				 NULL, NULL,		/* lock */
3351 				 &ss->rx_big.dmat);	/* tag */
3352 	if (err != 0) {
3353 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3354 			      err);
3355 		return err;
3356 	}
3357 	for (i = 0; i <= ss->rx_small.mask; i++) {
3358 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3359 					&ss->rx_small.info[i].map);
3360 		if (err != 0) {
3361 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3362 				      err);
3363 			return err;
3364 		}
3365 	}
3366 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3367 				&ss->rx_small.extra_map);
3368 	if (err != 0) {
3369 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3370 			      err);
3371 		return err;
3372 	}
3373 
3374 	for (i = 0; i <= ss->rx_big.mask; i++) {
3375 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3376 					&ss->rx_big.info[i].map);
3377 		if (err != 0) {
3378 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3379 				      err);
3380 			return err;
3381 		}
3382 	}
3383 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3384 				&ss->rx_big.extra_map);
3385 	if (err != 0) {
3386 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3387 			      err);
3388 		return err;
3389 	}
3390 
3391 	/* now allocate TX resources */
3392 
3393 #ifndef IFNET_BUF_RING
3394 	/* only use a single TX ring for now */
3395 	if (ss != ss->sc->ss)
3396 		return 0;
3397 #endif
3398 
3399 	ss->tx.mask = tx_ring_entries - 1;
3400 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3401 
3402 
3403 	/* allocate the tx request copy block */
3404 	bytes = 8 +
3405 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3406 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3407 	/* ensure req_list entries are aligned to 8 bytes */
3408 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3409 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3410 
3411 	/* allocate the tx busdma segment list */
3412 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3413 	ss->tx.seg_list = (bus_dma_segment_t *)
3414 		malloc(bytes, M_DEVBUF, M_WAITOK);
3415 
3416 	/* allocate the tx host info ring */
3417 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3418 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3419 
3420 	/* allocate the tx busdma resources */
3421 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3422 				 1,			/* alignment */
3423 				 sc->tx_boundary,	/* boundary */
3424 				 BUS_SPACE_MAXADDR,	/* low */
3425 				 BUS_SPACE_MAXADDR,	/* high */
3426 				 NULL, NULL,		/* filter */
3427 				 65536 + 256,		/* maxsize */
3428 				 ss->tx.max_desc - 2,	/* num segs */
3429 				 sc->tx_boundary,	/* maxsegsz */
3430 				 BUS_DMA_ALLOCNOW,	/* flags */
3431 				 NULL, NULL,		/* lock */
3432 				 &ss->tx.dmat);		/* tag */
3433 
3434 	if (err != 0) {
3435 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3436 			      err);
3437 		return err;
3438 	}
3439 
3440 	/* now use these tags to setup dmamaps for each slot
3441 	   in the ring */
3442 	for (i = 0; i <= ss->tx.mask; i++) {
3443 		err = bus_dmamap_create(ss->tx.dmat, 0,
3444 					&ss->tx.info[i].map);
3445 		if (err != 0) {
3446 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3447 				      err);
3448 			return err;
3449 		}
3450 	}
3451 	return 0;
3452 
3453 }
3454 
3455 static int
3456 mxge_alloc_rings(mxge_softc_t *sc)
3457 {
3458 	mxge_cmd_t cmd;
3459 	int tx_ring_size;
3460 	int tx_ring_entries, rx_ring_entries;
3461 	int err, slice;
3462 
3463 	/* get ring sizes */
3464 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3465 	tx_ring_size = cmd.data0;
3466 	if (err != 0) {
3467 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3468 		goto abort;
3469 	}
3470 
3471 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3472 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3473 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3474 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3475 	IFQ_SET_READY(&sc->ifp->if_snd);
3476 
3477 	for (slice = 0; slice < sc->num_slices; slice++) {
3478 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3479 					     rx_ring_entries,
3480 					     tx_ring_entries);
3481 		if (err != 0)
3482 			goto abort;
3483 	}
3484 	return 0;
3485 
3486 abort:
3487 	mxge_free_rings(sc);
3488 	return err;
3489 
3490 }
3491 
3492 
3493 static void
3494 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3495 {
3496 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3497 
3498 	if (bufsize < MCLBYTES) {
3499 		/* easy, everything fits in a single buffer */
3500 		*big_buf_size = MCLBYTES;
3501 		*cl_size = MCLBYTES;
3502 		*nbufs = 1;
3503 		return;
3504 	}
3505 
3506 	if (bufsize < MJUMPAGESIZE) {
3507 		/* still easy, everything still fits in a single buffer */
3508 		*big_buf_size = MJUMPAGESIZE;
3509 		*cl_size = MJUMPAGESIZE;
3510 		*nbufs = 1;
3511 		return;
3512 	}
3513 #if MXGE_VIRT_JUMBOS
3514 	/* now we need to use virtually contiguous buffers */
3515 	*cl_size = MJUM9BYTES;
3516 	*big_buf_size = 4096;
3517 	*nbufs = mtu / 4096 + 1;
3518 	/* needs to be a power of two, so round up */
3519 	if (*nbufs == 3)
3520 		*nbufs = 4;
3521 #else
3522 	*cl_size = MJUM9BYTES;
3523 	*big_buf_size = MJUM9BYTES;
3524 	*nbufs = 1;
3525 #endif
3526 }
3527 
3528 static int
3529 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3530 {
3531 	mxge_softc_t *sc;
3532 	mxge_cmd_t cmd;
3533 	bus_dmamap_t map;
3534 	int err, i, slice;
3535 
3536 
3537 	sc = ss->sc;
3538 	slice = ss - sc->ss;
3539 
3540 #if defined(INET) || defined(INET6)
3541 	(void)tcp_lro_init(&ss->lc);
3542 #endif
3543 	ss->lc.ifp = sc->ifp;
3544 
3545 	/* get the lanai pointers to the send and receive rings */
3546 
3547 	err = 0;
3548 #ifndef IFNET_BUF_RING
3549 	/* We currently only send from the first slice */
3550 	if (slice == 0) {
3551 #endif
3552 		cmd.data0 = slice;
3553 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3554 		ss->tx.lanai =
3555 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3556 		ss->tx.send_go = (volatile uint32_t *)
3557 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3558 		ss->tx.send_stop = (volatile uint32_t *)
3559 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3560 #ifndef IFNET_BUF_RING
3561 	}
3562 #endif
3563 	cmd.data0 = slice;
3564 	err |= mxge_send_cmd(sc,
3565 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3566 	ss->rx_small.lanai =
3567 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3568 	cmd.data0 = slice;
3569 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3570 	ss->rx_big.lanai =
3571 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3572 
3573 	if (err != 0) {
3574 		device_printf(sc->dev,
3575 			      "failed to get ring sizes or locations\n");
3576 		return EIO;
3577 	}
3578 
3579 	/* stock receive rings */
3580 	for (i = 0; i <= ss->rx_small.mask; i++) {
3581 		map = ss->rx_small.info[i].map;
3582 		err = mxge_get_buf_small(ss, map, i);
3583 		if (err) {
3584 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3585 				      i, ss->rx_small.mask + 1);
3586 			return ENOMEM;
3587 		}
3588 	}
3589 	for (i = 0; i <= ss->rx_big.mask; i++) {
3590 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3591 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3592 	}
3593 	ss->rx_big.nbufs = nbufs;
3594 	ss->rx_big.cl_size = cl_size;
3595 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3596 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3597 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3598 		map = ss->rx_big.info[i].map;
3599 		err = mxge_get_buf_big(ss, map, i);
3600 		if (err) {
3601 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3602 				      i, ss->rx_big.mask + 1);
3603 			return ENOMEM;
3604 		}
3605 	}
3606 	return 0;
3607 }
3608 
3609 static int
3610 mxge_open(mxge_softc_t *sc)
3611 {
3612 	mxge_cmd_t cmd;
3613 	int err, big_bytes, nbufs, slice, cl_size, i;
3614 	bus_addr_t bus;
3615 	volatile uint8_t *itable;
3616 	struct mxge_slice_state *ss;
3617 
3618 	/* Copy the MAC address in case it was overridden */
3619 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3620 
3621 	err = mxge_reset(sc, 1);
3622 	if (err != 0) {
3623 		device_printf(sc->dev, "failed to reset\n");
3624 		return EIO;
3625 	}
3626 
3627 	if (sc->num_slices > 1) {
3628 		/* setup the indirection table */
3629 		cmd.data0 = sc->num_slices;
3630 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3631 				    &cmd);
3632 
3633 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3634 				     &cmd);
3635 		if (err != 0) {
3636 			device_printf(sc->dev,
3637 				      "failed to setup rss tables\n");
3638 			return err;
3639 		}
3640 
3641 		/* just enable an identity mapping */
3642 		itable = sc->sram + cmd.data0;
3643 		for (i = 0; i < sc->num_slices; i++)
3644 			itable[i] = (uint8_t)i;
3645 
3646 		cmd.data0 = 1;
3647 		cmd.data1 = mxge_rss_hash_type;
3648 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3649 		if (err != 0) {
3650 			device_printf(sc->dev, "failed to enable slices\n");
3651 			return err;
3652 		}
3653 	}
3654 
3655 
3656 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3657 
3658 	cmd.data0 = nbufs;
3659 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3660 			    &cmd);
3661 	/* error is only meaningful if we're trying to set
3662 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3663 	if (err && nbufs > 1) {
3664 		device_printf(sc->dev,
3665 			      "Failed to set alway-use-n to %d\n",
3666 			      nbufs);
3667 		return EIO;
3668 	}
3669 	/* Give the firmware the mtu and the big and small buffer
3670 	   sizes.  The firmware wants the big buf size to be a power
3671 	   of two. Luckily, FreeBSD's clusters are powers of two */
3672 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3673 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3674 	cmd.data0 = MHLEN - MXGEFW_PAD;
3675 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3676 			     &cmd);
3677 	cmd.data0 = big_bytes;
3678 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3679 
3680 	if (err != 0) {
3681 		device_printf(sc->dev, "failed to setup params\n");
3682 		goto abort;
3683 	}
3684 
3685 	/* Now give him the pointer to the stats block */
3686 	for (slice = 0;
3687 #ifdef IFNET_BUF_RING
3688 	     slice < sc->num_slices;
3689 #else
3690 	     slice < 1;
3691 #endif
3692 	     slice++) {
3693 		ss = &sc->ss[slice];
3694 		cmd.data0 =
3695 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3696 		cmd.data1 =
3697 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3698 		cmd.data2 = sizeof(struct mcp_irq_data);
3699 		cmd.data2 |= (slice << 16);
3700 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3701 	}
3702 
3703 	if (err != 0) {
3704 		bus = sc->ss->fw_stats_dma.bus_addr;
3705 		bus += offsetof(struct mcp_irq_data, send_done_count);
3706 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3707 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3708 		err = mxge_send_cmd(sc,
3709 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3710 				    &cmd);
3711 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3712 		sc->fw_multicast_support = 0;
3713 	} else {
3714 		sc->fw_multicast_support = 1;
3715 	}
3716 
3717 	if (err != 0) {
3718 		device_printf(sc->dev, "failed to setup params\n");
3719 		goto abort;
3720 	}
3721 
3722 	for (slice = 0; slice < sc->num_slices; slice++) {
3723 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3724 		if (err != 0) {
3725 			device_printf(sc->dev, "couldn't open slice %d\n",
3726 				      slice);
3727 			goto abort;
3728 		}
3729 	}
3730 
3731 	/* Finally, start the firmware running */
3732 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3733 	if (err) {
3734 		device_printf(sc->dev, "Couldn't bring up link\n");
3735 		goto abort;
3736 	}
3737 #ifdef IFNET_BUF_RING
3738 	for (slice = 0; slice < sc->num_slices; slice++) {
3739 		ss = &sc->ss[slice];
3740 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3741 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3742 	}
3743 #endif
3744 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3745 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3746 
3747 	return 0;
3748 
3749 
3750 abort:
3751 	mxge_free_mbufs(sc);
3752 
3753 	return err;
3754 }
3755 
3756 static int
3757 mxge_close(mxge_softc_t *sc, int down)
3758 {
3759 	mxge_cmd_t cmd;
3760 	int err, old_down_cnt;
3761 #ifdef IFNET_BUF_RING
3762 	struct mxge_slice_state *ss;
3763 	int slice;
3764 #endif
3765 
3766 #ifdef IFNET_BUF_RING
3767 	for (slice = 0; slice < sc->num_slices; slice++) {
3768 		ss = &sc->ss[slice];
3769 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3770 	}
3771 #endif
3772 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3773 	if (!down) {
3774 		old_down_cnt = sc->down_cnt;
3775 		wmb();
3776 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3777 		if (err) {
3778 			device_printf(sc->dev,
3779 				      "Couldn't bring down link\n");
3780 		}
3781 		if (old_down_cnt == sc->down_cnt) {
3782 			/* wait for down irq */
3783 			DELAY(10 * sc->intr_coal_delay);
3784 		}
3785 		wmb();
3786 		if (old_down_cnt == sc->down_cnt) {
3787 			device_printf(sc->dev, "never got down irq\n");
3788 		}
3789 	}
3790 	mxge_free_mbufs(sc);
3791 
3792 	return 0;
3793 }
3794 
3795 static void
3796 mxge_setup_cfg_space(mxge_softc_t *sc)
3797 {
3798 	device_t dev = sc->dev;
3799 	int reg;
3800 	uint16_t lnk, pectl;
3801 
3802 	/* find the PCIe link width and set max read request to 4KB*/
3803 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3804 		lnk = pci_read_config(dev, reg + 0x12, 2);
3805 		sc->link_width = (lnk >> 4) & 0x3f;
3806 
3807 		if (sc->pectl == 0) {
3808 			pectl = pci_read_config(dev, reg + 0x8, 2);
3809 			pectl = (pectl & ~0x7000) | (5 << 12);
3810 			pci_write_config(dev, reg + 0x8, pectl, 2);
3811 			sc->pectl = pectl;
3812 		} else {
3813 			/* restore saved pectl after watchdog reset */
3814 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3815 		}
3816 	}
3817 
3818 	/* Enable DMA and Memory space access */
3819 	pci_enable_busmaster(dev);
3820 }
3821 
3822 static uint32_t
3823 mxge_read_reboot(mxge_softc_t *sc)
3824 {
3825 	device_t dev = sc->dev;
3826 	uint32_t vs;
3827 
3828 	/* find the vendor specific offset */
3829 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3830 		device_printf(sc->dev,
3831 			      "could not find vendor specific offset\n");
3832 		return (uint32_t)-1;
3833 	}
3834 	/* enable read32 mode */
3835 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3836 	/* tell NIC which register to read */
3837 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3838 	return (pci_read_config(dev, vs + 0x14, 4));
3839 }
3840 
3841 static void
3842 mxge_watchdog_reset(mxge_softc_t *sc)
3843 {
3844 	struct pci_devinfo *dinfo;
3845 	struct mxge_slice_state *ss;
3846 	int err, running, s, num_tx_slices = 1;
3847 	uint32_t reboot;
3848 	uint16_t cmd;
3849 
3850 	err = ENXIO;
3851 
3852 	device_printf(sc->dev, "Watchdog reset!\n");
3853 
3854 	/*
3855 	 * check to see if the NIC rebooted.  If it did, then all of
3856 	 * PCI config space has been reset, and things like the
3857 	 * busmaster bit will be zero.  If this is the case, then we
3858 	 * must restore PCI config space before the NIC can be used
3859 	 * again
3860 	 */
3861 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3862 	if (cmd == 0xffff) {
3863 		/*
3864 		 * maybe the watchdog caught the NIC rebooting; wait
3865 		 * up to 100ms for it to finish.  If it does not come
3866 		 * back, then give up
3867 		 */
3868 		DELAY(1000*100);
3869 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3870 		if (cmd == 0xffff) {
3871 			device_printf(sc->dev, "NIC disappeared!\n");
3872 		}
3873 	}
3874 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3875 		/* print the reboot status */
3876 		reboot = mxge_read_reboot(sc);
3877 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3878 			      reboot);
3879 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3880 		if (running) {
3881 
3882 			/*
3883 			 * quiesce NIC so that TX routines will not try to
3884 			 * xmit after restoration of BAR
3885 			 */
3886 
3887 			/* Mark the link as down */
3888 			if (sc->link_state) {
3889 				sc->link_state = 0;
3890 				if_link_state_change(sc->ifp,
3891 						     LINK_STATE_DOWN);
3892 			}
3893 #ifdef IFNET_BUF_RING
3894 			num_tx_slices = sc->num_slices;
3895 #endif
3896 			/* grab all TX locks to ensure no tx  */
3897 			for (s = 0; s < num_tx_slices; s++) {
3898 				ss = &sc->ss[s];
3899 				mtx_lock(&ss->tx.mtx);
3900 			}
3901 			mxge_close(sc, 1);
3902 		}
3903 		/* restore PCI configuration space */
3904 		dinfo = device_get_ivars(sc->dev);
3905 		pci_cfg_restore(sc->dev, dinfo);
3906 
3907 		/* and redo any changes we made to our config space */
3908 		mxge_setup_cfg_space(sc);
3909 
3910 		/* reload f/w */
3911 		err = mxge_load_firmware(sc, 0);
3912 		if (err) {
3913 			device_printf(sc->dev,
3914 				      "Unable to re-load f/w\n");
3915 		}
3916 		if (running) {
3917 			if (!err)
3918 				err = mxge_open(sc);
3919 			/* release all TX locks */
3920 			for (s = 0; s < num_tx_slices; s++) {
3921 				ss = &sc->ss[s];
3922 #ifdef IFNET_BUF_RING
3923 				mxge_start_locked(ss);
3924 #endif
3925 				mtx_unlock(&ss->tx.mtx);
3926 			}
3927 		}
3928 		sc->watchdog_resets++;
3929 	} else {
3930 		device_printf(sc->dev,
3931 			      "NIC did not reboot, not resetting\n");
3932 		err = 0;
3933 	}
3934 	if (err) {
3935 		device_printf(sc->dev, "watchdog reset failed\n");
3936 	} else {
3937 		if (sc->dying == 2)
3938 			sc->dying = 0;
3939 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3940 	}
3941 }
3942 
3943 static void
3944 mxge_watchdog_task(void *arg, int pending)
3945 {
3946 	mxge_softc_t *sc = arg;
3947 
3948 
3949 	mtx_lock(&sc->driver_mtx);
3950 	mxge_watchdog_reset(sc);
3951 	mtx_unlock(&sc->driver_mtx);
3952 }
3953 
3954 static void
3955 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3956 {
3957 	tx = &sc->ss[slice].tx;
3958 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3959 	device_printf(sc->dev,
3960 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3961 		      tx->req, tx->done, tx->queue_active);
3962 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3963 			      tx->activate, tx->deactivate);
3964 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3965 		      tx->pkt_done,
3966 		      be32toh(sc->ss->fw_stats->send_done_count));
3967 }
3968 
3969 static int
3970 mxge_watchdog(mxge_softc_t *sc)
3971 {
3972 	mxge_tx_ring_t *tx;
3973 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3974 	int i, err = 0;
3975 
3976 	/* see if we have outstanding transmits, which
3977 	   have been pending for more than mxge_ticks */
3978 	for (i = 0;
3979 #ifdef IFNET_BUF_RING
3980 	     (i < sc->num_slices) && (err == 0);
3981 #else
3982 	     (i < 1) && (err == 0);
3983 #endif
3984 	     i++) {
3985 		tx = &sc->ss[i].tx;
3986 		if (tx->req != tx->done &&
3987 		    tx->watchdog_req != tx->watchdog_done &&
3988 		    tx->done == tx->watchdog_done) {
3989 			/* check for pause blocking before resetting */
3990 			if (tx->watchdog_rx_pause == rx_pause) {
3991 				mxge_warn_stuck(sc, tx, i);
3992 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3993 				return (ENXIO);
3994 			}
3995 			else
3996 				device_printf(sc->dev, "Flow control blocking "
3997 					      "xmits, check link partner\n");
3998 		}
3999 
4000 		tx->watchdog_req = tx->req;
4001 		tx->watchdog_done = tx->done;
4002 		tx->watchdog_rx_pause = rx_pause;
4003 	}
4004 
4005 	if (sc->need_media_probe)
4006 		mxge_media_probe(sc);
4007 	return (err);
4008 }
4009 
4010 static uint64_t
4011 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4012 {
4013 	struct mxge_softc *sc;
4014 	uint64_t rv;
4015 
4016 	sc = if_getsoftc(ifp);
4017 	rv = 0;
4018 
4019 	switch (cnt) {
4020 	case IFCOUNTER_IPACKETS:
4021 		for (int s = 0; s < sc->num_slices; s++)
4022 			rv += sc->ss[s].ipackets;
4023 		return (rv);
4024 	case IFCOUNTER_OPACKETS:
4025 		for (int s = 0; s < sc->num_slices; s++)
4026 			rv += sc->ss[s].opackets;
4027 		return (rv);
4028 	case IFCOUNTER_OERRORS:
4029 		for (int s = 0; s < sc->num_slices; s++)
4030 			rv += sc->ss[s].oerrors;
4031 		return (rv);
4032 #ifdef IFNET_BUF_RING
4033 	case IFCOUNTER_OBYTES:
4034 		for (int s = 0; s < sc->num_slices; s++)
4035 			rv += sc->ss[s].obytes;
4036 		return (rv);
4037 	case IFCOUNTER_OMCASTS:
4038 		for (int s = 0; s < sc->num_slices; s++)
4039 			rv += sc->ss[s].omcasts;
4040 		return (rv);
4041 	case IFCOUNTER_OQDROPS:
4042 		for (int s = 0; s < sc->num_slices; s++)
4043 			rv += sc->ss[s].tx.br->br_drops;
4044 		return (rv);
4045 #endif
4046 	default:
4047 		return (if_get_counter_default(ifp, cnt));
4048 	}
4049 }
4050 
4051 static void
4052 mxge_tick(void *arg)
4053 {
4054 	mxge_softc_t *sc = arg;
4055 	u_long pkts = 0;
4056 	int err = 0;
4057 	int running, ticks;
4058 	uint16_t cmd;
4059 
4060 	ticks = mxge_ticks;
4061 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4062 	if (running) {
4063 		if (!sc->watchdog_countdown) {
4064 			err = mxge_watchdog(sc);
4065 			sc->watchdog_countdown = 4;
4066 		}
4067 		sc->watchdog_countdown--;
4068 	}
4069 	if (pkts == 0) {
4070 		/* ensure NIC did not suffer h/w fault while idle */
4071 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4072 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4073 			sc->dying = 2;
4074 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4075 			err = ENXIO;
4076 		}
4077 		/* look less often if NIC is idle */
4078 		ticks *= 4;
4079 	}
4080 
4081 	if (err == 0)
4082 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4083 
4084 }
4085 
/*
 * Media change handler.  Every request is rejected with EINVAL; the
 * ifp argument is not examined.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	const int rc = EINVAL;

	return (rc);
}
4091 
4092 static int
4093 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4094 {
4095 	struct ifnet *ifp = sc->ifp;
4096 	int real_mtu, old_mtu;
4097 	int err = 0;
4098 
4099 
4100 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4101 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4102 		return EINVAL;
4103 	mtx_lock(&sc->driver_mtx);
4104 	old_mtu = ifp->if_mtu;
4105 	ifp->if_mtu = mtu;
4106 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4107 		mxge_close(sc, 0);
4108 		err = mxge_open(sc);
4109 		if (err != 0) {
4110 			ifp->if_mtu = old_mtu;
4111 			mxge_close(sc, 0);
4112 			(void) mxge_open(sc);
4113 		}
4114 	}
4115 	mtx_unlock(&sc->driver_mtx);
4116 	return err;
4117 }
4118 
4119 static void
4120 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4121 {
4122 	mxge_softc_t *sc = ifp->if_softc;
4123 
4124 
4125 	if (sc == NULL)
4126 		return;
4127 	ifmr->ifm_status = IFM_AVALID;
4128 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4129 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4130 	ifmr->ifm_active |= sc->current_media;
4131 }
4132 
4133 static int
4134 mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4135 {
4136 	mxge_cmd_t cmd;
4137 	uint32_t i2c_args;
4138 	int i, ms, err;
4139 
4140 
4141 	if (i2c->dev_addr != 0xA0 &&
4142 	    i2c->dev_addr != 0xA2)
4143 		return (EINVAL);
4144 	if (i2c->len > sizeof(i2c->data))
4145 		return (EINVAL);
4146 
4147 	for (i = 0; i < i2c->len; i++) {
4148 		i2c_args = i2c->dev_addr << 0x8;
4149 		i2c_args |= i2c->offset + i;
4150 		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4151 		cmd.data1 = i2c_args;
4152 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4153 
4154 		if (err != MXGEFW_CMD_OK)
4155 			return (EIO);
4156 		/* now we wait for the data to be cached */
4157 		cmd.data0 = i2c_args & 0xff;
4158 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4159 		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4160 			cmd.data0 = i2c_args & 0xff;
4161 			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4162 			if (err == EBUSY)
4163 				DELAY(1000);
4164 		}
4165 		if (err != MXGEFW_CMD_OK)
4166 			return (EIO);
4167 		i2c->data[i] = cmd.data0;
4168 	}
4169 	return (0);
4170 }
4171 
4172 static int
4173 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4174 {
4175 	mxge_softc_t *sc = ifp->if_softc;
4176 	struct ifreq *ifr = (struct ifreq *)data;
4177 	struct ifi2creq i2c;
4178 	int err, mask;
4179 
4180 	err = 0;
4181 	switch (command) {
4182 	case SIOCSIFMTU:
4183 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4184 		break;
4185 
4186 	case SIOCSIFFLAGS:
4187 		mtx_lock(&sc->driver_mtx);
4188 		if (sc->dying) {
4189 			mtx_unlock(&sc->driver_mtx);
4190 			return EINVAL;
4191 		}
4192 		if (ifp->if_flags & IFF_UP) {
4193 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4194 				err = mxge_open(sc);
4195 			} else {
4196 				/* take care of promis can allmulti
4197 				   flag chages */
4198 				mxge_change_promisc(sc,
4199 						    ifp->if_flags & IFF_PROMISC);
4200 				mxge_set_multicast_list(sc);
4201 			}
4202 		} else {
4203 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4204 				mxge_close(sc, 0);
4205 			}
4206 		}
4207 		mtx_unlock(&sc->driver_mtx);
4208 		break;
4209 
4210 	case SIOCADDMULTI:
4211 	case SIOCDELMULTI:
4212 		mtx_lock(&sc->driver_mtx);
4213 		if (sc->dying) {
4214 			mtx_unlock(&sc->driver_mtx);
4215 			return (EINVAL);
4216 		}
4217 		mxge_set_multicast_list(sc);
4218 		mtx_unlock(&sc->driver_mtx);
4219 		break;
4220 
4221 	case SIOCSIFCAP:
4222 		mtx_lock(&sc->driver_mtx);
4223 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4224 		if (mask & IFCAP_TXCSUM) {
4225 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4226 				mask &= ~IFCAP_TSO4;
4227 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4228 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4229 			} else {
4230 				ifp->if_capenable |= IFCAP_TXCSUM;
4231 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4232 			}
4233 		}
4234 		if (mask & IFCAP_RXCSUM) {
4235 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4236 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4237 			} else {
4238 				ifp->if_capenable |= IFCAP_RXCSUM;
4239 			}
4240 		}
4241 		if (mask & IFCAP_TSO4) {
4242 			if (IFCAP_TSO4 & ifp->if_capenable) {
4243 				ifp->if_capenable &= ~IFCAP_TSO4;
4244 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4245 				ifp->if_capenable |= IFCAP_TSO4;
4246 				ifp->if_hwassist |= CSUM_TSO;
4247 			} else {
4248 				printf("mxge requires tx checksum offload"
4249 				       " be enabled to use TSO\n");
4250 				err = EINVAL;
4251 			}
4252 		}
4253 #if IFCAP_TSO6
4254 		if (mask & IFCAP_TXCSUM_IPV6) {
4255 			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4256 				mask &= ~IFCAP_TSO6;
4257 				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4258 						       | IFCAP_TSO6);
4259 				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4260 						      | CSUM_UDP);
4261 			} else {
4262 				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4263 				ifp->if_hwassist |= (CSUM_TCP_IPV6
4264 						     | CSUM_UDP_IPV6);
4265 			}
4266 		}
4267 		if (mask & IFCAP_RXCSUM_IPV6) {
4268 			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4269 				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4270 			} else {
4271 				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4272 			}
4273 		}
4274 		if (mask & IFCAP_TSO6) {
4275 			if (IFCAP_TSO6 & ifp->if_capenable) {
4276 				ifp->if_capenable &= ~IFCAP_TSO6;
4277 			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4278 				ifp->if_capenable |= IFCAP_TSO6;
4279 				ifp->if_hwassist |= CSUM_TSO;
4280 			} else {
4281 				printf("mxge requires tx checksum offload"
4282 				       " be enabled to use TSO\n");
4283 				err = EINVAL;
4284 			}
4285 		}
4286 #endif /*IFCAP_TSO6 */
4287 
4288 		if (mask & IFCAP_LRO)
4289 			ifp->if_capenable ^= IFCAP_LRO;
4290 		if (mask & IFCAP_VLAN_HWTAGGING)
4291 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4292 		if (mask & IFCAP_VLAN_HWTSO)
4293 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4294 
4295 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4296 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4297 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4298 
4299 		mtx_unlock(&sc->driver_mtx);
4300 		VLAN_CAPABILITIES(ifp);
4301 
4302 		break;
4303 
4304 	case SIOCGIFMEDIA:
4305 		mtx_lock(&sc->driver_mtx);
4306 		if (sc->dying) {
4307 			mtx_unlock(&sc->driver_mtx);
4308 			return (EINVAL);
4309 		}
4310 		mxge_media_probe(sc);
4311 		mtx_unlock(&sc->driver_mtx);
4312 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4313 				    &sc->media, command);
4314 		break;
4315 
4316 	case SIOCGI2C:
4317 		if (sc->connector != MXGE_XFP &&
4318 		    sc->connector != MXGE_SFP) {
4319 			err = ENXIO;
4320 			break;
4321 		}
4322 		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4323 		if (err != 0)
4324 			break;
4325 		mtx_lock(&sc->driver_mtx);
4326 		if (sc->dying) {
4327 			mtx_unlock(&sc->driver_mtx);
4328 			return (EINVAL);
4329 		}
4330 		err = mxge_fetch_i2c(sc, &i2c);
4331 		mtx_unlock(&sc->driver_mtx);
4332 		if (err == 0)
4333 			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4334 			    sizeof(i2c));
4335 		break;
4336 	default:
4337 		err = ether_ioctl(ifp, command, data);
4338 		break;
4339 	}
4340 	return err;
4341 }
4342 
4343 static void
4344 mxge_fetch_tunables(mxge_softc_t *sc)
4345 {
4346 
4347 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4348 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4349 			  &mxge_flow_control);
4350 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4351 			  &mxge_intr_coal_delay);
4352 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4353 			  &mxge_nvidia_ecrc_enable);
4354 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4355 			  &mxge_force_firmware);
4356 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4357 			  &mxge_deassert_wait);
4358 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4359 			  &mxge_verbose);
4360 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4361 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4362 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4363 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4364 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4365 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4366 
4367 	if (bootverbose)
4368 		mxge_verbose = 1;
4369 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4370 		mxge_intr_coal_delay = 30;
4371 	if (mxge_ticks == 0)
4372 		mxge_ticks = hz / 2;
4373 	sc->pause = mxge_flow_control;
4374 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4375 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4376 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4377 	}
4378 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4379 	    mxge_initial_mtu < ETHER_MIN_LEN)
4380 		mxge_initial_mtu = ETHERMTU_JUMBO;
4381 
4382 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4383 		mxge_throttle = MXGE_MAX_THROTTLE;
4384 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4385 		mxge_throttle = MXGE_MIN_THROTTLE;
4386 	sc->throttle = mxge_throttle;
4387 }
4388 
4389 
4390 static void
4391 mxge_free_slices(mxge_softc_t *sc)
4392 {
4393 	struct mxge_slice_state *ss;
4394 	int i;
4395 
4396 
4397 	if (sc->ss == NULL)
4398 		return;
4399 
4400 	for (i = 0; i < sc->num_slices; i++) {
4401 		ss = &sc->ss[i];
4402 		if (ss->fw_stats != NULL) {
4403 			mxge_dma_free(&ss->fw_stats_dma);
4404 			ss->fw_stats = NULL;
4405 #ifdef IFNET_BUF_RING
4406 			if (ss->tx.br != NULL) {
4407 				drbr_free(ss->tx.br, M_DEVBUF);
4408 				ss->tx.br = NULL;
4409 			}
4410 #endif
4411 			mtx_destroy(&ss->tx.mtx);
4412 		}
4413 		if (ss->rx_done.entry != NULL) {
4414 			mxge_dma_free(&ss->rx_done.dma);
4415 			ss->rx_done.entry = NULL;
4416 		}
4417 	}
4418 	free(sc->ss, M_DEVBUF);
4419 	sc->ss = NULL;
4420 }
4421 
4422 static int
4423 mxge_alloc_slices(mxge_softc_t *sc)
4424 {
4425 	mxge_cmd_t cmd;
4426 	struct mxge_slice_state *ss;
4427 	size_t bytes;
4428 	int err, i, max_intr_slots;
4429 
4430 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4431 	if (err != 0) {
4432 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4433 		return err;
4434 	}
4435 	sc->rx_ring_size = cmd.data0;
4436 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4437 
4438 	bytes = sizeof (*sc->ss) * sc->num_slices;
4439 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4440 	if (sc->ss == NULL)
4441 		return (ENOMEM);
4442 	for (i = 0; i < sc->num_slices; i++) {
4443 		ss = &sc->ss[i];
4444 
4445 		ss->sc = sc;
4446 
4447 		/* allocate per-slice rx interrupt queues */
4448 
4449 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4450 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4451 		if (err != 0)
4452 			goto abort;
4453 		ss->rx_done.entry = ss->rx_done.dma.addr;
4454 		bzero(ss->rx_done.entry, bytes);
4455 
4456 		/*
4457 		 * allocate the per-slice firmware stats; stats
4458 		 * (including tx) are used used only on the first
4459 		 * slice for now
4460 		 */
4461 #ifndef IFNET_BUF_RING
4462 		if (i > 0)
4463 			continue;
4464 #endif
4465 
4466 		bytes = sizeof (*ss->fw_stats);
4467 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4468 				     sizeof (*ss->fw_stats), 64);
4469 		if (err != 0)
4470 			goto abort;
4471 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4472 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4473 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4474 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4475 #ifdef IFNET_BUF_RING
4476 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4477 					   &ss->tx.mtx);
4478 #endif
4479 	}
4480 
4481 	return (0);
4482 
4483 abort:
4484 	mxge_free_slices(sc);
4485 	return (ENOMEM);
4486 }
4487 
4488 static void
4489 mxge_slice_probe(mxge_softc_t *sc)
4490 {
4491 	mxge_cmd_t cmd;
4492 	char *old_fw;
4493 	int msix_cnt, status, max_intr_slots;
4494 
4495 	sc->num_slices = 1;
4496 	/*
4497 	 *  don't enable multiple slices if they are not enabled,
4498 	 *  or if this is not an SMP system
4499 	 */
4500 
4501 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4502 		return;
4503 
4504 	/* see how many MSI-X interrupts are available */
4505 	msix_cnt = pci_msix_count(sc->dev);
4506 	if (msix_cnt < 2)
4507 		return;
4508 
4509 	/* now load the slice aware firmware see what it supports */
4510 	old_fw = sc->fw_name;
4511 	if (old_fw == mxge_fw_aligned)
4512 		sc->fw_name = mxge_fw_rss_aligned;
4513 	else
4514 		sc->fw_name = mxge_fw_rss_unaligned;
4515 	status = mxge_load_firmware(sc, 0);
4516 	if (status != 0) {
4517 		device_printf(sc->dev, "Falling back to a single slice\n");
4518 		return;
4519 	}
4520 
4521 	/* try to send a reset command to the card to see if it
4522 	   is alive */
4523 	memset(&cmd, 0, sizeof (cmd));
4524 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4525 	if (status != 0) {
4526 		device_printf(sc->dev, "failed reset\n");
4527 		goto abort_with_fw;
4528 	}
4529 
4530 	/* get rx ring size */
4531 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4532 	if (status != 0) {
4533 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4534 		goto abort_with_fw;
4535 	}
4536 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4537 
4538 	/* tell it the size of the interrupt queues */
4539 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4540 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4541 	if (status != 0) {
4542 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4543 		goto abort_with_fw;
4544 	}
4545 
4546 	/* ask the maximum number of slices it supports */
4547 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4548 	if (status != 0) {
4549 		device_printf(sc->dev,
4550 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4551 		goto abort_with_fw;
4552 	}
4553 	sc->num_slices = cmd.data0;
4554 	if (sc->num_slices > msix_cnt)
4555 		sc->num_slices = msix_cnt;
4556 
4557 	if (mxge_max_slices == -1) {
4558 		/* cap to number of CPUs in system */
4559 		if (sc->num_slices > mp_ncpus)
4560 			sc->num_slices = mp_ncpus;
4561 	} else {
4562 		if (sc->num_slices > mxge_max_slices)
4563 			sc->num_slices = mxge_max_slices;
4564 	}
4565 	/* make sure it is a power of two */
4566 	while (sc->num_slices & (sc->num_slices - 1))
4567 		sc->num_slices--;
4568 
4569 	if (mxge_verbose)
4570 		device_printf(sc->dev, "using %d slices\n",
4571 			      sc->num_slices);
4572 
4573 	return;
4574 
4575 abort_with_fw:
4576 	sc->fw_name = old_fw;
4577 	(void) mxge_load_firmware(sc, 0);
4578 }
4579 
4580 static int
4581 mxge_add_msix_irqs(mxge_softc_t *sc)
4582 {
4583 	size_t bytes;
4584 	int count, err, i, rid;
4585 
4586 	rid = PCIR_BAR(2);
4587 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4588 						    &rid, RF_ACTIVE);
4589 
4590 	if (sc->msix_table_res == NULL) {
4591 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4592 		return ENXIO;
4593 	}
4594 
4595 	count = sc->num_slices;
4596 	err = pci_alloc_msix(sc->dev, &count);
4597 	if (err != 0) {
4598 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4599 			      "err = %d \n", sc->num_slices, err);
4600 		goto abort_with_msix_table;
4601 	}
4602 	if (count < sc->num_slices) {
4603 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4604 			      count, sc->num_slices);
4605 		device_printf(sc->dev,
4606 			      "Try setting hw.mxge.max_slices to %d\n",
4607 			      count);
4608 		err = ENOSPC;
4609 		goto abort_with_msix;
4610 	}
4611 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4612 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4613 	if (sc->msix_irq_res == NULL) {
4614 		err = ENOMEM;
4615 		goto abort_with_msix;
4616 	}
4617 
4618 	for (i = 0; i < sc->num_slices; i++) {
4619 		rid = i + 1;
4620 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4621 							  SYS_RES_IRQ,
4622 							  &rid, RF_ACTIVE);
4623 		if (sc->msix_irq_res[i] == NULL) {
4624 			device_printf(sc->dev, "couldn't allocate IRQ res"
4625 				      " for message %d\n", i);
4626 			err = ENXIO;
4627 			goto abort_with_res;
4628 		}
4629 	}
4630 
4631 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4632 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4633 
4634 	for (i = 0; i < sc->num_slices; i++) {
4635 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4636 				     INTR_TYPE_NET | INTR_MPSAFE,
4637 #if __FreeBSD_version > 700030
4638 				     NULL,
4639 #endif
4640 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4641 		if (err != 0) {
4642 			device_printf(sc->dev, "couldn't setup intr for "
4643 				      "message %d\n", i);
4644 			goto abort_with_intr;
4645 		}
4646 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4647 				  sc->msix_ih[i], "s%d", i);
4648 	}
4649 
4650 	if (mxge_verbose) {
4651 		device_printf(sc->dev, "using %d msix IRQs:",
4652 			      sc->num_slices);
4653 		for (i = 0; i < sc->num_slices; i++)
4654 			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4655 		printf("\n");
4656 	}
4657 	return (0);
4658 
4659 abort_with_intr:
4660 	for (i = 0; i < sc->num_slices; i++) {
4661 		if (sc->msix_ih[i] != NULL) {
4662 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4663 					  sc->msix_ih[i]);
4664 			sc->msix_ih[i] = NULL;
4665 		}
4666 	}
4667 	free(sc->msix_ih, M_DEVBUF);
4668 
4669 
4670 abort_with_res:
4671 	for (i = 0; i < sc->num_slices; i++) {
4672 		rid = i + 1;
4673 		if (sc->msix_irq_res[i] != NULL)
4674 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4675 					     sc->msix_irq_res[i]);
4676 		sc->msix_irq_res[i] = NULL;
4677 	}
4678 	free(sc->msix_irq_res, M_DEVBUF);
4679 
4680 
4681 abort_with_msix:
4682 	pci_release_msi(sc->dev);
4683 
4684 abort_with_msix_table:
4685 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4686 			     sc->msix_table_res);
4687 
4688 	return err;
4689 }
4690 
4691 static int
4692 mxge_add_single_irq(mxge_softc_t *sc)
4693 {
4694 	int count, err, rid;
4695 
4696 	count = pci_msi_count(sc->dev);
4697 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4698 		rid = 1;
4699 	} else {
4700 		rid = 0;
4701 		sc->legacy_irq = 1;
4702 	}
4703 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4704 					     RF_SHAREABLE | RF_ACTIVE);
4705 	if (sc->irq_res == NULL) {
4706 		device_printf(sc->dev, "could not alloc interrupt\n");
4707 		return ENXIO;
4708 	}
4709 	if (mxge_verbose)
4710 		device_printf(sc->dev, "using %s irq %jd\n",
4711 			      sc->legacy_irq ? "INTx" : "MSI",
4712 			      rman_get_start(sc->irq_res));
4713 	err = bus_setup_intr(sc->dev, sc->irq_res,
4714 			     INTR_TYPE_NET | INTR_MPSAFE,
4715 #if __FreeBSD_version > 700030
4716 			     NULL,
4717 #endif
4718 			     mxge_intr, &sc->ss[0], &sc->ih);
4719 	if (err != 0) {
4720 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4721 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4722 		if (!sc->legacy_irq)
4723 			pci_release_msi(sc->dev);
4724 	}
4725 	return err;
4726 }
4727 
4728 static void
4729 mxge_rem_msix_irqs(mxge_softc_t *sc)
4730 {
4731 	int i, rid;
4732 
4733 	for (i = 0; i < sc->num_slices; i++) {
4734 		if (sc->msix_ih[i] != NULL) {
4735 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4736 					  sc->msix_ih[i]);
4737 			sc->msix_ih[i] = NULL;
4738 		}
4739 	}
4740 	free(sc->msix_ih, M_DEVBUF);
4741 
4742 	for (i = 0; i < sc->num_slices; i++) {
4743 		rid = i + 1;
4744 		if (sc->msix_irq_res[i] != NULL)
4745 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4746 					     sc->msix_irq_res[i]);
4747 		sc->msix_irq_res[i] = NULL;
4748 	}
4749 	free(sc->msix_irq_res, M_DEVBUF);
4750 
4751 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4752 			     sc->msix_table_res);
4753 
4754 	pci_release_msi(sc->dev);
4755 	return;
4756 }
4757 
4758 static void
4759 mxge_rem_single_irq(mxge_softc_t *sc)
4760 {
4761 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4762 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4763 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4764 	if (!sc->legacy_irq)
4765 		pci_release_msi(sc->dev);
4766 }
4767 
4768 static void
4769 mxge_rem_irq(mxge_softc_t *sc)
4770 {
4771 	if (sc->num_slices > 1)
4772 		mxge_rem_msix_irqs(sc);
4773 	else
4774 		mxge_rem_single_irq(sc);
4775 }
4776 
4777 static int
4778 mxge_add_irq(mxge_softc_t *sc)
4779 {
4780 	int err;
4781 
4782 	if (sc->num_slices > 1)
4783 		err = mxge_add_msix_irqs(sc);
4784 	else
4785 		err = mxge_add_single_irq(sc);
4786 
4787 	if (0 && err == 0 && sc->num_slices > 1) {
4788 		mxge_rem_msix_irqs(sc);
4789 		err = mxge_add_msix_irqs(sc);
4790 	}
4791 	return err;
4792 }
4793 
4794 
4795 static int
4796 mxge_attach(device_t dev)
4797 {
4798 	mxge_cmd_t cmd;
4799 	mxge_softc_t *sc = device_get_softc(dev);
4800 	struct ifnet *ifp;
4801 	int err, rid;
4802 
4803 	sc->dev = dev;
4804 	mxge_fetch_tunables(sc);
4805 
4806 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4807 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4808 				  taskqueue_thread_enqueue, &sc->tq);
4809 	if (sc->tq == NULL) {
4810 		err = ENOMEM;
4811 		goto abort_with_nothing;
4812 	}
4813 
4814 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4815 				 1,			/* alignment */
4816 				 0,			/* boundary */
4817 				 BUS_SPACE_MAXADDR,	/* low */
4818 				 BUS_SPACE_MAXADDR,	/* high */
4819 				 NULL, NULL,		/* filter */
4820 				 65536 + 256,		/* maxsize */
4821 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4822 				 65536,			/* maxsegsize */
4823 				 0,			/* flags */
4824 				 NULL, NULL,		/* lock */
4825 				 &sc->parent_dmat);	/* tag */
4826 
4827 	if (err != 0) {
4828 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4829 			      err);
4830 		goto abort_with_tq;
4831 	}
4832 
4833 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4834 	if (ifp == NULL) {
4835 		device_printf(dev, "can not if_alloc()\n");
4836 		err = ENOSPC;
4837 		goto abort_with_parent_dmat;
4838 	}
4839 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4840 
4841 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4842 		 device_get_nameunit(dev));
4843 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4844 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4845 		 "%s:drv", device_get_nameunit(dev));
4846 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4847 		 MTX_NETWORK_LOCK, MTX_DEF);
4848 
4849 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4850 
4851 	mxge_setup_cfg_space(sc);
4852 
4853 	/* Map the board into the kernel */
4854 	rid = PCIR_BARS;
4855 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4856 					     RF_ACTIVE);
4857 	if (sc->mem_res == NULL) {
4858 		device_printf(dev, "could not map memory\n");
4859 		err = ENXIO;
4860 		goto abort_with_lock;
4861 	}
4862 	sc->sram = rman_get_virtual(sc->mem_res);
4863 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4864 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4865 		device_printf(dev, "impossible memory region size %jd\n",
4866 			      rman_get_size(sc->mem_res));
4867 		err = ENXIO;
4868 		goto abort_with_mem_res;
4869 	}
4870 
4871 	/* make NULL terminated copy of the EEPROM strings section of
4872 	   lanai SRAM */
4873 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4874 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4875 				rman_get_bushandle(sc->mem_res),
4876 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4877 				sc->eeprom_strings,
4878 				MXGE_EEPROM_STRINGS_SIZE - 2);
4879 	err = mxge_parse_strings(sc);
4880 	if (err != 0)
4881 		goto abort_with_mem_res;
4882 
4883 	/* Enable write combining for efficient use of PCIe bus */
4884 	mxge_enable_wc(sc);
4885 
4886 	/* Allocate the out of band dma memory */
4887 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4888 			     sizeof (mxge_cmd_t), 64);
4889 	if (err != 0)
4890 		goto abort_with_mem_res;
4891 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4892 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4893 	if (err != 0)
4894 		goto abort_with_cmd_dma;
4895 
4896 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4897 	if (err != 0)
4898 		goto abort_with_zeropad_dma;
4899 
4900 	/* select & load the firmware */
4901 	err = mxge_select_firmware(sc);
4902 	if (err != 0)
4903 		goto abort_with_dmabench;
4904 	sc->intr_coal_delay = mxge_intr_coal_delay;
4905 
4906 	mxge_slice_probe(sc);
4907 	err = mxge_alloc_slices(sc);
4908 	if (err != 0)
4909 		goto abort_with_dmabench;
4910 
4911 	err = mxge_reset(sc, 0);
4912 	if (err != 0)
4913 		goto abort_with_slices;
4914 
4915 	err = mxge_alloc_rings(sc);
4916 	if (err != 0) {
4917 		device_printf(sc->dev, "failed to allocate rings\n");
4918 		goto abort_with_slices;
4919 	}
4920 
4921 	err = mxge_add_irq(sc);
4922 	if (err != 0) {
4923 		device_printf(sc->dev, "failed to add irq\n");
4924 		goto abort_with_rings;
4925 	}
4926 
4927 	ifp->if_baudrate = IF_Gbps(10);
4928 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4929 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4930 		IFCAP_RXCSUM_IPV6;
4931 #if defined(INET) || defined(INET6)
4932 	ifp->if_capabilities |= IFCAP_LRO;
4933 #endif
4934 
4935 #ifdef MXGE_NEW_VLAN_API
4936 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4937 
4938 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4939 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4940 	    sc->fw_ver_tiny >= 32)
4941 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4942 #endif
4943 	sc->max_mtu = mxge_max_mtu(sc);
4944 	if (sc->max_mtu >= 9000)
4945 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4946 	else
4947 		device_printf(dev, "MTU limited to %d.  Install "
4948 			      "latest firmware for 9000 byte jumbo support\n",
4949 			      sc->max_mtu - ETHER_HDR_LEN);
4950 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4951 	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4952 	/* check to see if f/w supports TSO for IPv6 */
4953 	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4954 		if (CSUM_TCP_IPV6)
4955 			ifp->if_capabilities |= IFCAP_TSO6;
4956 		sc->max_tso6_hlen = min(cmd.data0,
4957 					sizeof (sc->ss[0].scratch));
4958 	}
4959 	ifp->if_capenable = ifp->if_capabilities;
4960 	if (sc->lro_cnt == 0)
4961 		ifp->if_capenable &= ~IFCAP_LRO;
4962 	ifp->if_init = mxge_init;
4963 	ifp->if_softc = sc;
4964 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4965 	ifp->if_ioctl = mxge_ioctl;
4966 	ifp->if_start = mxge_start;
4967 	ifp->if_get_counter = mxge_get_counter;
4968 	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4969 	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4970 	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4971 	/* Initialise the ifmedia structure */
4972 	ifmedia_init(&sc->media, 0, mxge_media_change,
4973 		     mxge_media_status);
4974 	mxge_media_init(sc);
4975 	mxge_media_probe(sc);
4976 	sc->dying = 0;
4977 	ether_ifattach(ifp, sc->mac_addr);
4978 	/* ether_ifattach sets mtu to ETHERMTU */
4979 	if (mxge_initial_mtu != ETHERMTU)
4980 		mxge_change_mtu(sc, mxge_initial_mtu);
4981 
4982 	mxge_add_sysctls(sc);
4983 #ifdef IFNET_BUF_RING
4984 	ifp->if_transmit = mxge_transmit;
4985 	ifp->if_qflush = mxge_qflush;
4986 #endif
4987 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4988 				device_get_nameunit(sc->dev));
4989 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4990 	return 0;
4991 
4992 abort_with_rings:
4993 	mxge_free_rings(sc);
4994 abort_with_slices:
4995 	mxge_free_slices(sc);
4996 abort_with_dmabench:
4997 	mxge_dma_free(&sc->dmabench_dma);
4998 abort_with_zeropad_dma:
4999 	mxge_dma_free(&sc->zeropad_dma);
5000 abort_with_cmd_dma:
5001 	mxge_dma_free(&sc->cmd_dma);
5002 abort_with_mem_res:
5003 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5004 abort_with_lock:
5005 	pci_disable_busmaster(dev);
5006 	mtx_destroy(&sc->cmd_mtx);
5007 	mtx_destroy(&sc->driver_mtx);
5008 	if_free(ifp);
5009 abort_with_parent_dmat:
5010 	bus_dma_tag_destroy(sc->parent_dmat);
5011 abort_with_tq:
5012 	if (sc->tq != NULL) {
5013 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5014 		taskqueue_free(sc->tq);
5015 		sc->tq = NULL;
5016 	}
5017 abort_with_nothing:
5018 	return err;
5019 }
5020 
5021 static int
5022 mxge_detach(device_t dev)
5023 {
5024 	mxge_softc_t *sc = device_get_softc(dev);
5025 
5026 	if (mxge_vlans_active(sc)) {
5027 		device_printf(sc->dev,
5028 			      "Detach vlans before removing module\n");
5029 		return EBUSY;
5030 	}
5031 	mtx_lock(&sc->driver_mtx);
5032 	sc->dying = 1;
5033 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
5034 		mxge_close(sc, 0);
5035 	mtx_unlock(&sc->driver_mtx);
5036 	ether_ifdetach(sc->ifp);
5037 	if (sc->tq != NULL) {
5038 		taskqueue_drain(sc->tq, &sc->watchdog_task);
5039 		taskqueue_free(sc->tq);
5040 		sc->tq = NULL;
5041 	}
5042 	callout_drain(&sc->co_hdl);
5043 	ifmedia_removeall(&sc->media);
5044 	mxge_dummy_rdma(sc, 0);
5045 	mxge_rem_sysctls(sc);
5046 	mxge_rem_irq(sc);
5047 	mxge_free_rings(sc);
5048 	mxge_free_slices(sc);
5049 	mxge_dma_free(&sc->dmabench_dma);
5050 	mxge_dma_free(&sc->zeropad_dma);
5051 	mxge_dma_free(&sc->cmd_dma);
5052 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5053 	pci_disable_busmaster(dev);
5054 	mtx_destroy(&sc->cmd_mtx);
5055 	mtx_destroy(&sc->driver_mtx);
5056 	if_free(sc->ifp);
5057 	bus_dma_tag_destroy(sc->parent_dmat);
5058 	return 0;
5059 }
5060 
5061 static int
5062 mxge_shutdown(device_t dev)
5063 {
5064 	return 0;
5065 }
5066 
5067 /*
5068   This file uses Myri10GE driver indentation.
5069 
5070   Local Variables:
5071   c-file-style:"linux"
5072   tab-width:8
5073   End:
5074 */
5075