/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $

***************************************************************************/

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__x86_64)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/netif/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
/* XXX: not yet */
/* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
static int mxge_initial_mtu = ETHERMTU;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

/* XXX: we don't have Large Receive Offload support yet */
inline int
mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
{
	(void)ss;
	(void)m_head;
	(void)csum;
	return 1;
}

inline void
mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
{
	(void)ss;
	(void)lro;
}

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if 0
#if defined(__i386) || defined(__x86_64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
#else
	sc->wc = 0;	/* TBD: PAT support */
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

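	/*
	 * A 4KB-aligned allocation larger than 4KB must be allowed to
	 * span 4KB lines (a single segment cannot both exceed the
	 * boundary and avoid crossing it), so drop the boundary
	 * restriction in that case; otherwise keep each segment
	 * within one 4KB region.
	 */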
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
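	/*
	 * Advance ptr just past the terminating NUL of the current
	 * eeprom string.  Note that the macro argument is unused; the
	 * macro operates on the enclosing function's ptr and limit.
	 */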
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
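			/*
			 * ptr points at "MAC=xx:xx:...".  The += 1
			 * below, combined with the += 3 at the top of
			 * the loop, lands ptr on the first hex digit
			 * of each octet; strtoul() with a NULL endptr
			 * does not advance ptr, so each iteration
			 * skips exactly one "xx:" group.
			 */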
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible via this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

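	/*
	 * Extended config space is laid out ECAM-style: 1MB of
	 * address space per bus and 4KB per function, with 8
	 * functions per slot; hence the multipliers below.
	 */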
	off = base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
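	/*
	 * (cmd.data0 >> 16) transfers of len bytes each, completed in
	 * (cmd.data0 & 0xffff) half-microsecond ticks; the factor of
	 * 2 converts bytes per half-usec into MB/s.
	 */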
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
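	/*
	 * Max_Read_Request_Size lives in bits 14:12 of the PCIe
	 * Device Control register (offset 0x8 into the capability);
	 * the encoding 5 (101b) means 4096 bytes.
	 */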
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;

	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

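	/* align the scratch buffer on an 8-byte boundary */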
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/*
	 * We may be called during attach, before if_serializer is available.
	 * This is not a fast path, just check for NULL
	 */
	if (sc->ifp->if_serializer)
		ASSERT_SERIALIZED(sc->ifp->if_serializer);

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				 "Using firmware currently running on NIC."
				 "  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (sc->ifp->if_serializer)
		ASSERT_SERIALIZED(sc->ifp->if_serializer);
	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	if (ifp->if_serializer)
		ASSERT_SERIALIZED(ifp->if_serializer);

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			return;
		}
	}
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	lwkt_serialize_enter(sc->ifp->if_serializer);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	lwkt_serialize_exit(sc->ifp->if_serializer);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	lwkt_serialize_enter(sc->ifp->if_serializer);
	err = mxge_change_pause(sc, enabled);
	lwkt_serialize_exit(sc->ifp->if_serializer);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_flags & IFF_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	lwkt_serialize_enter(sc->ifp->if_serializer);
	err = mxge_change_lro_locked(sc, lro_cnt);
	lwkt_serialize_exit(sc->ifp->if_serializer);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
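        /*
         * The firmware stats block is big-endian.  Swap the value
         * into arg2 and clear arg1, so that sysctl_handle_int()
         * reports arg2 as a plain read-only integer instead of
         * dereferencing the raw big-endian word.
         */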
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
	sysctl_ctx_free(&sc->sysctl_ctx);
	sc->sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = &sc->sysctl_ctx;
	sysctl_ctx_init(ctx);
	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
					  OID_AUTO,
					  device_get_nameunit(sc->dev),
					  CTLFLAG_RD, 0, "");
	if (sc->sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add sysctl node\n");
		return;
	}

	children = SYSCTL_CHILDREN(sc->sysctl_tree);
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
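        /*
         * Copy one request at a time, from the end of the block down
         * to slot 1, leaving the first slot for the caller to write
         * last; its flags are what mark the whole chain valid.
         */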
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

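        /*
         * The flags byte sits in the final 32-bit word of the first
         * descriptor, so rewriting just that word (with last_flags
         * restored) is what makes the chain visible to the NIC.
         */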
        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints += 3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints += 3;
        *dst_ints = *src_ints;
        tx->req += cnt;
        wmb();
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp), ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
1867 	 * that request. For TSO send requests with one or more cuts
1868 	 * in the middle, this is the number of RDMAs starting
1869 	 * after the last cut in the request. All previous
1870 	 * segments before the last cut implicitly have 1 RDMA.
1871 	 *
1872 	 * Since the number of RDMAs is not known beforehand,
1873 	 * it must be filled-in retroactively - after each
1874 	 * segmentation cut or at the end of the entire packet.
1875 	 */
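	/*
	 * Worked example (illustrative, not from the original source):
	 * with ip_off 14, a 20-byte IP header, a 20-byte TCP header and
	 * mss 1448, cum_len starts at -54.  Descriptors covering those
	 * first 54 bytes are header (negative cum_len); the request that
	 * crosses 0 is trimmed to seglen = -cum_len, rdma_count restarts
	 * at -1, and the retroactive (req - rdma_count)->rdma_count
	 * store below patches each segment's first descriptor with that
	 * segment's RDMA total.
	 */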
1876 
1877 	while (busdma_seg_cnt) {
1878 		/* Break the busdma segment up into pieces*/
1879 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1880 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1881 		len = seg->ds_len;
1882 
1883 		while (len) {
1884 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1885 			seglen = len;
1886 			cum_len_next = cum_len + seglen;
1887 			(req-rdma_count)->rdma_count = rdma_count + 1;
1888 			if (__predict_true(cum_len >= 0)) {
1889 				/* payload */
1890 				chop = (cum_len_next > mss);
1891 				cum_len_next = cum_len_next % mss;
1892 				next_is_first = (cum_len_next == 0);
1893 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1894 				flags_next |= next_is_first *
1895 					MXGEFW_FLAGS_FIRST;
1896 				rdma_count |= -(chop | next_is_first);
1897 				rdma_count += chop & !next_is_first;
1898 			} else if (cum_len_next >= 0) {
1899 				/* header ends */
1900 				rdma_count = -1;
1901 				cum_len_next = 0;
1902 				seglen = -cum_len;
1903 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1904 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1905 					MXGEFW_FLAGS_FIRST |
1906 					(small * MXGEFW_FLAGS_SMALL);
1907 			}
1908 
1909 			req->addr_high = high_swapped;
1910 			req->addr_low = htobe32(low);
1911 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 			req->pad = 0;
1913 			req->rdma_count = 1;
1914 			req->length = htobe16(seglen);
1915 			req->cksum_offset = cksum_offset;
1916 			req->flags = flags | ((cum_len & 1) *
1917 					      MXGEFW_FLAGS_ALIGN_ODD);
1918 			low += seglen;
1919 			len -= seglen;
1920 			cum_len = cum_len_next;
1921 			flags = flags_next;
1922 			req++;
1923 			cnt++;
1924 			rdma_count++;
1925 			if (__predict_false(cksum_offset > seglen))
1926 				cksum_offset -= seglen;
1927 			else
1928 				cksum_offset = 0;
1929 			if (__predict_false(cnt > tx->max_desc))
1930 				goto drop;
1931 		}
1932 		busdma_seg_cnt--;
1933 		seg++;
1934 	}
1935 	(req-rdma_count)->rdma_count = rdma_count;
1936 
1937 	do {
1938 		req--;
1939 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1940 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1941 
1942 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1943 	mxge_submit_req(tx, tx->req_list, cnt);
1944 #ifdef IFNET_BUF_RING
1945 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1946 		/* tell the NIC to start polling this slice */
1947 		*tx->send_go = 1;
1948 		tx->queue_active = 1;
1949 		tx->activate++;
1950 		wmb();
1951 	}
1952 #endif
1953 	return;
1954 
1955 drop:
1956 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1957 	m_freem(m);
1958 	ss->oerrors++;
1959 	if (!once) {
1960 		kprintf("tx->max_desc exceeded via TSO!\n");
1961 		kprintf("mss = %d, %ld, %d!\n", mss,
1962 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1963 		once = 1;
1964 	}
1965 	return;
1966 
1967 }
1968 
1969 #endif /* IFCAP_TSO4 */
1970 
1971 #ifdef MXGE_NEW_VLAN_API
1972 /*
1973  * We reproduce the software vlan tag insertion from
1974  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1975  * vlan tag insertion. We need to advertise this in order to have the
1976  * vlan interface respect our csum offload flags.
1977  */
1978 static struct mbuf *
1979 mxge_vlan_tag_insert(struct mbuf *m)
1980 {
1981 	struct ether_vlan_header *evl;
1982 
1983 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1984 	if (__predict_false(m == NULL))
1985 		return NULL;
1986 	if (m->m_len < sizeof(*evl)) {
1987 		m = m_pullup(m, sizeof(*evl));
1988 		if (__predict_false(m == NULL))
1989 			return NULL;
1990 	}
1991 	/*
1992 	 * Transform the Ethernet header into an Ethernet header
1993 	 * with 802.1Q encapsulation.
1994 	 */
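	/*
	 * Layout sketch (illustrative): M_PREPEND opened EVL_ENCAPLEN (4)
	 * bytes at the front; the bcopy below slides the 12 address bytes
	 * back over them, and the vacated gap becomes the 802.1Q tag:
	 *
	 *   before:  dst[6] src[6] type[2] payload ...
	 *   after:   dst[6] src[6] 0x8100 tag type[2] payload ...
	 */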
1995 	evl = mtod(m, struct ether_vlan_header *);
1996 	bcopy((char *)evl + EVL_ENCAPLEN,
1997 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1998 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1999 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2000 	m->m_flags &= ~M_VLANTAG;
2001 	return m;
2002 }
2003 #endif /* MXGE_NEW_VLAN_API */
2004 
2005 static void
2006 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2007 {
2008 	mxge_softc_t *sc;
2009 	mcp_kreq_ether_send_t *req;
2010 	bus_dma_segment_t *seg;
2011 	struct mbuf *m_tmp;
2012 	struct ifnet *ifp;
2013 	mxge_tx_ring_t *tx;
2014 	struct ip *ip;
2015 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2016 	uint16_t pseudo_hdr_offset;
2017 	uint8_t flags, cksum_offset;
2018 
2020 	sc = ss->sc;
2021 	ifp = sc->ifp;
2022 	tx = &ss->tx;
2023 
2024 	ip_off = sizeof (struct ether_header);
2025 #ifdef MXGE_NEW_VLAN_API
2026 	if (m->m_flags & M_VLANTAG) {
2027 		m = mxge_vlan_tag_insert(m);
2028 		if (__predict_false(m == NULL))
2029 			goto drop;
2030 		ip_off += EVL_ENCAPLEN;
2031 	}
2032 #endif
2033 	/* (try to) map the frame for DMA */
2034 	idx = tx->req & tx->mask;
2035 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2036 					   m, tx->seg_list, 1, &cnt,
2037 					   BUS_DMA_NOWAIT);
2038 	if (__predict_false(err == EFBIG)) {
2039 		/* Too many segments in the chain.  Try
2040 		   to defrag */
2041 		m_tmp = m_defrag(m, MB_DONTWAIT);
2042 		if (m_tmp == NULL) {
2043 			goto drop;
2044 		}
2045 		ss->tx.defrag++;
2046 		m = m_tmp;
2047 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2048 					      tx->info[idx].map,
2049 					      m, tx->seg_list, 1, &cnt,
2050 					      BUS_DMA_NOWAIT);
2051 	}
2052 	if (__predict_false(err != 0)) {
2053 		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2054 			      " packet len = %d\n", err, m->m_pkthdr.len);
2055 		goto drop;
2056 	}
2057 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2058 			BUS_DMASYNC_PREWRITE);
2059 	tx->info[idx].m = m;
2060 
2061 #if IFCAP_TSO4
2062 	/* TSO is different enough, we handle it in another routine */
2063 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2064 		mxge_encap_tso(ss, m, cnt, ip_off);
2065 		return;
2066 	}
2067 #endif
2068 
2069 	req = tx->req_list;
2070 	cksum_offset = 0;
2071 	pseudo_hdr_offset = 0;
2072 	flags = MXGEFW_FLAGS_NO_TSO;
2073 
2074 	/* checksum offloading? */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2076 		/* ensure ip header is in first mbuf, copy
2077 		   it to a scratch buffer if not */
2078 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2079 			m_copydata(m, 0, ip_off + sizeof (*ip),
2080 				   ss->scratch);
2081 			ip = (struct ip *)(ss->scratch + ip_off);
2082 		} else {
2083 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2084 		}
2085 		cksum_offset = ip_off + (ip->ip_hl << 2);
2086 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2087 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2088 		req->cksum_offset = cksum_offset;
2089 		flags |= MXGEFW_FLAGS_CKSUM;
2090 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2091 	} else {
2092 		odd_flag = 0;
2093 	}
2094 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2095 		flags |= MXGEFW_FLAGS_SMALL;
2096 
2097 	/* convert segments into a request list */
2098 	cum_len = 0;
2099 	seg = tx->seg_list;
2100 	req->flags = MXGEFW_FLAGS_FIRST;
2101 	for (i = 0; i < cnt; i++) {
2102 		req->addr_low =
2103 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2104 		req->addr_high =
2105 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2106 		req->length = htobe16(seg->ds_len);
2107 		req->cksum_offset = cksum_offset;
2108 		if (cksum_offset > seg->ds_len)
2109 			cksum_offset -= seg->ds_len;
2110 		else
2111 			cksum_offset = 0;
2112 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2113 		req->pad = 0; /* complete solid 16-byte block */
2114 		req->rdma_count = 1;
2115 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2116 		cum_len += seg->ds_len;
2117 		seg++;
2118 		req++;
2119 		req->flags = 0;
2120 	}
2121 	req--;
2122 	/* pad runts to 60 bytes */
2123 	if (cum_len < 60) {
2124 		req++;
2125 		req->addr_low =
2126 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 		req->addr_high =
2128 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 		req->length = htobe16(60 - cum_len);
2130 		req->cksum_offset = 0;
2131 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2132 		req->pad = 0; /* complete solid 16-byte block */
2133 		req->rdma_count = 1;
2134 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2135 		cnt++;
2136 	}
2137 
2138 	tx->req_list[0].rdma_count = cnt;
2139 #if 0
2140 	/* print what the firmware will see */
2141 	for (i = 0; i < cnt; i++) {
2142 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2143 		    "cso:%d, flags:0x%x, rdma:%d\n",
2144 		    i, (int)ntohl(tx->req_list[i].addr_high),
2145 		    (int)ntohl(tx->req_list[i].addr_low),
2146 		    (int)ntohs(tx->req_list[i].length),
2147 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2148 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2149 		    tx->req_list[i].rdma_count);
2150 	}
2151 	kprintf("--------------\n");
2152 #endif
2153 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2154 	mxge_submit_req(tx, tx->req_list, cnt);
2155 #ifdef IFNET_BUF_RING
2156 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2157 		/* tell the NIC to start polling this slice */
2158 		*tx->send_go = 1;
2159 		tx->queue_active = 1;
2160 		tx->activate++;
2161 		wmb();
2162 	}
2163 #endif
2164 	return;
2165 
2166 drop:
2167 	m_freem(m);
2168 	ss->oerrors++;
2169 	return;
2170 }
2171 
2172 #ifdef IFNET_BUF_RING
2173 static void
2174 mxge_qflush(struct ifnet *ifp)
2175 {
2176 	mxge_softc_t *sc = ifp->if_softc;
2177 	mxge_tx_ring_t *tx;
2178 	struct mbuf *m;
2179 	int slice;
2180 
2181 	for (slice = 0; slice < sc->num_slices; slice++) {
2182 		tx = &sc->ss[slice].tx;
2183 		lwkt_serialize_enter(sc->ifp->if_serializer);
2184 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2185 			m_freem(m);
2186 		lwkt_serialize_exit(sc->ifp->if_serializer);
2187 	}
2188 	if_qflush(ifp);
2189 }
2190 
2191 static inline void
2192 mxge_start_locked(struct mxge_slice_state *ss)
2193 {
2194 	mxge_softc_t *sc;
2195 	struct mbuf *m;
2196 	struct ifnet *ifp;
2197 	mxge_tx_ring_t *tx;
2198 
2199 	sc = ss->sc;
2200 	ifp = sc->ifp;
2201 	tx = &ss->tx;
2202 
2203 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2204 		m = drbr_dequeue(ifp, tx->br);
2205 		if (m == NULL) {
2206 			return;
2207 		}
2208 		/* let BPF see it */
2209 		BPF_MTAP(ifp, m);
2210 
2211 		/* give it to the nic */
2212 		mxge_encap(ss, m);
2213 	}
2214 	/* ran out of transmit slots */
2215 	if (((ss->if_flags & IFF_OACTIVE) == 0)
2216 	    && (!drbr_empty(ifp, tx->br))) {
2217 		ss->if_flags |= IFF_OACTIVE;
2218 		tx->stall++;
2219 	}
2220 }
2221 
2222 static int
2223 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2224 {
2225 	mxge_softc_t *sc;
2226 	struct ifnet *ifp;
2227 	mxge_tx_ring_t *tx;
2228 	int err;
2229 
2230 	sc = ss->sc;
2231 	ifp = sc->ifp;
2232 	tx = &ss->tx;
2233 
2234 	if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2235 	    IFF_RUNNING) {
2236 		err = drbr_enqueue(ifp, tx->br, m);
2237 		return (err);
2238 	}
2239 
2240 	if (drbr_empty(ifp, tx->br) &&
2241 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2242 		/* let BPF see it */
2243 		BPF_MTAP(ifp, m);
2244 		/* give it to the nic */
2245 		mxge_encap(ss, m);
2246 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2247 		return (err);
2248 	}
2249 	if (!drbr_empty(ifp, tx->br))
2250 		mxge_start_locked(ss);
2251 	return (0);
2252 }
2253 
2254 static int
2255 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2256 {
2257 	mxge_softc_t *sc = ifp->if_softc;
2258 	struct mxge_slice_state *ss;
2259 	mxge_tx_ring_t *tx;
2260 	int err = 0;
2261 	int slice = 0;	/* single slice until flowid hashing is enabled */
2262 
2263 #if 0
2264 	slice = m->m_pkthdr.flowid;
2265 #endif
2266 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2267 
2268 	ss = &sc->ss[slice];
2269 	tx = &ss->tx;
2270 
2271 	if (lwkt_serialize_try(ifp->if_serializer)) {
2272 		err = mxge_transmit_locked(ss, m);
2273 		lwkt_serialize_exit(ifp->if_serializer);
2274 	} else {
2275 		err = drbr_enqueue(ifp, tx->br, m);
2276 	}
2277 
2278 	return (err);
2279 }
2280 
2281 #else
2282 
2283 static inline void
2284 mxge_start_locked(struct mxge_slice_state *ss)
2285 {
2286 	mxge_softc_t *sc;
2287 	struct mbuf *m;
2288 	struct ifnet *ifp;
2289 	mxge_tx_ring_t *tx;
2290 
2291 	sc = ss->sc;
2292 	ifp = sc->ifp;
2293 	tx = &ss->tx;
2294 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2295 		m = ifq_dequeue(&ifp->if_snd, NULL);
2296 		if (m == NULL) {
2297 			return;
2298 		}
2299 		/* let BPF see it */
2300 		BPF_MTAP(ifp, m);
2301 
2302 		/* give it to the nic */
2303 		mxge_encap(ss, m);
2304 	}
2305 	/* ran out of transmit slots */
2306 	if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2307 		sc->ifp->if_flags |= IFF_OACTIVE;
2308 		tx->stall++;
2309 	}
2310 }
2311 #endif
2312 static void
2313 mxge_start(struct ifnet *ifp)
2314 {
2315 	mxge_softc_t *sc = ifp->if_softc;
2316 	struct mxge_slice_state *ss;
2317 
2318 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2319 	/* only use the first slice for now */
2320 	ss = &sc->ss[0];
2321 	mxge_start_locked(ss);
2322 }
2323 
2324 /*
2325  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2326  * at most 32 bytes at a time, so as to avoid involving the software
2327  * pio handler in the nic.   We re-write the first segment's low
2328  * DMA address to mark it valid only after we write the entire chunk
2329  * in a burst
2330  */
2331 static inline void
2332 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2333 		mcp_kreq_ether_recv_t *src)
2334 {
2335 	uint32_t low;
2336 
2337 	low = src->addr_low;
2338 	src->addr_low = 0xffffffff;
2339 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 	wmb();
2341 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2342 	wmb();
2343 	src->addr_low = low;
2344 	dst->addr_low = low;
2345 	wmb();
2346 }
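
/*
 * Note (illustrative): 0xffffffff serves as a not-yet-valid marker --
 * the driver also prefills unstocked big-ring slots with it in
 * mxge_slice_open() -- so poisoning src->addr_low for the two 32-byte
 * bursts and rewriting dst->addr_low last publishes all 8 receive
 * descriptors to the NIC at once, mirroring the deferred-flags trick
 * in mxge_submit_req().
 */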
2347 
2348 static int
2349 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 {
2351 	bus_dma_segment_t seg;
2352 	struct mbuf *m;
2353 	mxge_rx_ring_t *rx = &ss->rx_small;
2354 	int cnt, err;
2355 
2356 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2357 	if (m == NULL) {
2358 		rx->alloc_fail++;
2359 		err = ENOBUFS;
2360 		goto done;
2361 	}
2362 	m->m_len = m->m_pkthdr.len = MHLEN;
2363 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2364 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2365 	if (err != 0) {
2366 		kprintf("can't dmamap small (%d)\n", err);
2367 		m_free(m);
2368 		goto done;
2369 	}
2370 	rx->info[idx].m = m;
2371 	rx->shadow[idx].addr_low =
2372 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 	rx->shadow[idx].addr_high =
2374 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2375 
2376 done:
2377 	if ((idx & 7) == 7)
2378 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2379 	return err;
2380 }
2381 
2382 
2383 static int
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2385 {
2386 	bus_dma_segment_t seg[3];
2387 	struct mbuf *m;
2388 	mxge_rx_ring_t *rx = &ss->rx_big;
2389 	int cnt, err, i;
2390 
2391 	if (rx->cl_size == MCLBYTES)
2392 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 	else {
2394 #if 0
2395 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 #else
2397 		/*
2398 		 * XXX: allocate normal sized buffers for big buffers.
2399 		 * We should be fine as long as we don't get any jumbo frames
2400 		 */
2401 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2402 #endif
2403 	}
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2410 	m->m_len = m->m_pkthdr.len = rx->mlen;
2411 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2412 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2413 	if (err != 0) {
2414 		kprintf("can't dmamap big (%d)\n", err);
2415 		m_free(m);
2416 		goto done;
2417 	}
2418 	rx->info[idx].m = m;
2419 	rx->shadow[idx].addr_low =
2420 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2421 	rx->shadow[idx].addr_high =
2422 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2423 
2424 #if MXGE_VIRT_JUMBOS
2425 	for (i = 1; i < cnt; i++) {
2426 		rx->shadow[idx + i].addr_low =
2427 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2428 		rx->shadow[idx + i].addr_high =
2429 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2430 	}
2431 #endif
2432 
2433 done:
2434 	for (i = 0; i < rx->nbufs; i++) {
2435 		if ((idx & 7) == 7) {
2436 			mxge_submit_8rx(&rx->lanai[idx - 7],
2437 					&rx->shadow[idx - 7]);
2438 		}
2439 		idx++;
2440 	}
2441 	return err;
2442 }
2443 
2444 /*
2445  *  Myri10GE hardware checksums are not valid if the sender
2446  *  padded the frame with non-zero padding.  This is because
2447  *  the firmware just does a simple 16-bit 1s complement
2448  *  checksum across the entire frame, excluding the first 14
2449 	 *  bytes.  It is best to simply check the checksum and
2450  *  tell the stack about it only if the checksum is good
2451  */
2452 
2453 static inline uint16_t
2454 mxge_rx_csum(struct mbuf *m, int csum)
2455 {
2456 	struct ether_header *eh;
2457 	struct ip *ip;
2458 	uint16_t c;
2459 
2460 	eh = mtod(m, struct ether_header *);
2461 
2462 	/* only deal with IPv4 TCP & UDP for now */
2463 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2464 		return 1;
2465 	ip = (struct ip *)(eh + 1);
2466 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2467 			    ip->ip_p != IPPROTO_UDP))
2468 		return 1;
2469 #ifdef INET
2470 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2471 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2472 			    (ip->ip_hl << 2) + ip->ip_p));
2473 #else
2474 	c = 1;
2475 #endif
2476 	c ^= 0xffff;
2477 	return (c);
2478 }
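
/*
 * Illustrative note: callers treat a return of 0 as "checksum good".
 * The firmware's partial sum covers every byte after the 14-byte
 * Ethernet header; folding in the IPv4 pseudo header via in_pseudo()
 * and inverting leaves 0 precisely when the TCP/UDP checksum carried
 * in the frame verified, so non-IPv4 or non-TCP/UDP frames
 * short-circuit to 1 (never "good").
 */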
2479 
2480 static void
2481 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2482 {
2483 	struct ether_vlan_header *evl;
2484 	struct ether_header *eh;
2485 	uint32_t partial;
2486 
2487 	evl = mtod(m, struct ether_vlan_header *);
2488 	eh = mtod(m, struct ether_header *);
2489 
2490 	/*
2491 	 * fix checksum by subtracting EVL_ENCAPLEN bytes
2492 	 * after what the firmware thought was the end of the ethernet
2493 	 * header.
2494 	 */
2495 
2496 	/* put checksum into host byte order */
2497 	*csum = ntohs(*csum);
2498 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2499 	(*csum) += ~partial;
2500 	(*csum) +=  ((*csum) < ~partial);
2501 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2502 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2503 
2504 	/* restore checksum to network byte order;
2505 	   later consumers expect this */
2506 	*csum = htons(*csum);
2507 
2508 	/* save the tag */
2509 #ifdef MXGE_NEW_VLAN_API
2510 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2511 #else
2512 	{
2513 		struct m_tag *mtag;
2514 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2515 				   MB_DONTWAIT);
2516 		if (mtag == NULL)
2517 			return;
2518 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2519 		m_tag_prepend(m, mtag);
2520 	}
2521 
2522 #endif
2523 	m->m_flags |= M_VLANTAG;
2524 
2525 	/*
2526 	 * Remove the 802.1q header by copying the Ethernet
2527 	 * addresses over it and adjusting the beginning of
2528 	 * the data in the mbuf.  The encapsulated Ethernet
2529 	 * type field is already in place.
2530 	 */
2531 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2532 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2533 	m_adj(m, EVL_ENCAPLEN);
2534 }
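
/*
 * Minimal sketch of the arithmetic above (hypothetical helper, guarded
 * out): subtracting a 32-bit chunk from a 1s-complement sum is adding
 * its complement with end-around carry, then folding back to 16 bits.
 */
#if 0	/* example only, not compiled */
static uint16_t
csum_subtract32(uint16_t csum16, uint32_t chunk)
{
	uint32_t sum = csum16;

	sum += ~chunk;				/* add 1s complement */
	sum += (sum < ~chunk);			/* end-around carry */
	sum = (sum >> 16) + (sum & 0xffff);	/* fold; the first fold */
	sum = (sum >> 16) + (sum & 0xffff);	/* may carry again */
	return ((uint16_t)sum);
}
#endif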
2535 
2536 
2537 static inline void
2538 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2539 		   struct mbuf_chain *chain)
2540 {
2541 	mxge_softc_t *sc;
2542 	struct ifnet *ifp;
2543 	struct mbuf *m;
2544 	struct ether_header *eh;
2545 	mxge_rx_ring_t *rx;
2546 	bus_dmamap_t old_map;
2547 	int idx;
2548 	uint16_t tcpudp_csum;
2549 
2550 	sc = ss->sc;
2551 	ifp = sc->ifp;
2552 	rx = &ss->rx_big;
2553 	idx = rx->cnt & rx->mask;
2554 	rx->cnt += rx->nbufs;
2555 	/* save a pointer to the received mbuf */
2556 	m = rx->info[idx].m;
2557 	/* try to replace the received mbuf */
2558 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2559 		/* drop the frame -- the old mbuf is re-cycled */
2560 		ifp->if_ierrors++;
2561 		return;
2562 	}
2563 
2564 	/* unmap the received buffer */
2565 	old_map = rx->info[idx].map;
2566 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2567 	bus_dmamap_unload(rx->dmat, old_map);
2568 
2569 	/* swap the bus_dmamap_t's */
2570 	rx->info[idx].map = rx->extra_map;
2571 	rx->extra_map = old_map;
2572 
2573 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2574 	 * aligned */
2575 	m->m_data += MXGEFW_PAD;
2576 
2577 	m->m_pkthdr.rcvif = ifp;
2578 	m->m_len = m->m_pkthdr.len = len;
2579 	ss->ipackets++;
2580 	eh = mtod(m, struct ether_header *);
2581 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2582 		mxge_vlan_tag_remove(m, &csum);
2583 	}
2584 	/* if the checksum is valid, mark it in the mbuf header */
2585 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2586 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2587 			return;
2588 		/* otherwise, it was a UDP frame, or a TCP frame which
2589 		   we could not do LRO on.  Tell the stack that the
2590 		   checksum is good */
2591 		m->m_pkthdr.csum_data = 0xffff;
2592 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2593 	}
2594 #if 0
2595 	/* flowid only valid if RSS hashing is enabled */
2596 	if (sc->num_slices > 1) {
2597 		m->m_pkthdr.flowid = (ss - sc->ss);
2598 		m->m_flags |= M_FLOWID;
2599 	}
2600 #endif
2601 	ether_input_chain(ifp, m, NULL, chain);
2602 }
2603 
2604 static inline void
2605 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2606 		   struct mbuf_chain *chain)
2607 {
2608 	mxge_softc_t *sc;
2609 	struct ifnet *ifp;
2610 	struct ether_header *eh;
2611 	struct mbuf *m;
2612 	mxge_rx_ring_t *rx;
2613 	bus_dmamap_t old_map;
2614 	int idx;
2615 	uint16_t tcpudp_csum;
2616 
2617 	sc = ss->sc;
2618 	ifp = sc->ifp;
2619 	rx = &ss->rx_small;
2620 	idx = rx->cnt & rx->mask;
2621 	rx->cnt++;
2622 	/* save a pointer to the received mbuf */
2623 	m = rx->info[idx].m;
2624 	/* try to replace the received mbuf */
2625 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2626 		/* drop the frame -- the old mbuf is re-cycled */
2627 		ifp->if_ierrors++;
2628 		return;
2629 	}
2630 
2631 	/* unmap the received buffer */
2632 	old_map = rx->info[idx].map;
2633 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2634 	bus_dmamap_unload(rx->dmat, old_map);
2635 
2636 	/* swap the bus_dmamap_t's */
2637 	rx->info[idx].map = rx->extra_map;
2638 	rx->extra_map = old_map;
2639 
2640 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2641 	 * aligned */
2642 	m->m_data += MXGEFW_PAD;
2643 
2644 	m->m_pkthdr.rcvif = ifp;
2645 	m->m_len = m->m_pkthdr.len = len;
2646 	ss->ipackets++;
2647 	eh = mtod(m, struct ether_header *);
2648 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649 		mxge_vlan_tag_remove(m, &csum);
2650 	}
2651 	/* if the checksum is valid, mark it in the mbuf header */
2652 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2653 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2654 			return;
2655 		/* otherwise, it was a UDP frame, or a TCP frame which
2656 		   we could not do LRO on.  Tell the stack that the
2657 		   checksum is good */
2658 		m->m_pkthdr.csum_data = 0xffff;
2659 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2660 	}
2661 #if 0
2662 	/* flowid only valid if RSS hashing is enabled */
2663 	if (sc->num_slices > 1) {
2664 		m->m_pkthdr.flowid = (ss - sc->ss);
2665 		m->m_flags |= M_FLOWID;
2666 	}
2667 #endif
2668 	ether_input_chain(ifp, m, NULL, chain);
2669 }
2670 
2671 static inline void
2672 mxge_clean_rx_done(struct mxge_slice_state *ss)
2673 {
2674 	mxge_rx_done_t *rx_done = &ss->rx_done;
2675 	int limit = 0;
2676 	uint16_t length;
2677 	uint16_t checksum;
2678 	struct mbuf_chain chain[MAXCPU];
2679 
2680 	ether_input_chain_init(chain);
2681 	while (rx_done->entry[rx_done->idx].length != 0) {
2682 		length = ntohs(rx_done->entry[rx_done->idx].length);
2683 		rx_done->entry[rx_done->idx].length = 0;
2684 		checksum = rx_done->entry[rx_done->idx].checksum;
2685 		if (length <= (MHLEN - MXGEFW_PAD))
2686 			mxge_rx_done_small(ss, length, checksum, chain);
2687 		else
2688 			mxge_rx_done_big(ss, length, checksum, chain);
2689 		rx_done->cnt++;
2690 		rx_done->idx = rx_done->cnt & rx_done->mask;
2691 
2692 		/* limit potential for livelock */
2693 		if (__predict_false(++limit > rx_done->mask / 2))
2694 			break;
2695 	}
2696 	ether_input_dispatch(chain);
2697 #ifdef INET
2698 	while (!SLIST_EMPTY(&ss->lro_active)) {
2699 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2700 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2701 		mxge_lro_flush(ss, lro);
2702 	}
2703 #endif
2704 }
2705 
2706 
2707 static inline void
2708 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2709 {
2710 	struct ifnet *ifp;
2711 	mxge_tx_ring_t *tx;
2712 	struct mbuf *m;
2713 	bus_dmamap_t map;
2714 	int idx;
2715 	int *flags;
2716 
2717 	tx = &ss->tx;
2718 	ifp = ss->sc->ifp;
2719 	ASSERT_SERIALIZED(ifp->if_serializer);
2720 	while (tx->pkt_done != mcp_idx) {
2721 		idx = tx->done & tx->mask;
2722 		tx->done++;
2723 		m = tx->info[idx].m;
2724 		/* mbuf and DMA map only attached to the first
2725 		   segment per-mbuf */
2726 		if (m != NULL) {
2727 			ss->obytes += m->m_pkthdr.len;
2728 			if (m->m_flags & M_MCAST)
2729 				ss->omcasts++;
2730 			ss->opackets++;
2731 			tx->info[idx].m = NULL;
2732 			map = tx->info[idx].map;
2733 			bus_dmamap_unload(tx->dmat, map);
2734 			m_freem(m);
2735 		}
2736 		if (tx->info[idx].flag) {
2737 			tx->info[idx].flag = 0;
2738 			tx->pkt_done++;
2739 		}
2740 	}
2741 
2742 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2743            it's OK to send packets */
2744 #ifdef IFNET_BUF_RING
2745 	flags = &ss->if_flags;
2746 #else
2747 	flags = &ifp->if_flags;
2748 #endif
2749 	if ((*flags) & IFF_OACTIVE &&
2750 	    tx->req - tx->done < (tx->mask + 1)/4) {
2751 		*(flags) &= ~IFF_OACTIVE;
2752 		ss->tx.wake++;
2753 		mxge_start_locked(ss);
2754 	}
2755 #ifdef IFNET_BUF_RING
2756 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2757 		/* let the NIC stop polling this queue, since there
2758 		 * are no more transmits pending */
2759 		*tx->send_stop = 1;
2760 		tx->queue_active = 0;
2761 		tx->deactivate++;
2762 		wmb();
2763 	}
2766 #endif
2767 
2768 }
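
/*
 * Note (illustrative): tx->done counts reclaimed descriptors while
 * tx->pkt_done counts completed packets; the per-slot "flag" set at
 * submit time on each packet's last descriptor is what lets the loop
 * above advance pkt_done toward the firmware's send_done_count.
 */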
2769 
2770 static struct mxge_media_type mxge_xfp_media_types[] =
2771 {
2772 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2773 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2774 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2775 	{0,		(1 << 5),	"10GBASE-ER"},
2776 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2777 	{0,		(1 << 3),	"10GBASE-SW"},
2778 	{0,		(1 << 2),	"10GBASE-LW"},
2779 	{0,		(1 << 1),	"10GBASE-EW"},
2780 	{0,		(1 << 0),	"Reserved"}
2781 };
2782 static struct mxge_media_type mxge_sfp_media_types[] =
2783 {
2784 	{0,		(1 << 7),	"Reserved"},
2785 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2786 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2787 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2788 };
2789 
2790 static void
2791 mxge_set_media(mxge_softc_t *sc, int type)
2792 {
2793 	sc->media_flags |= type;
2794 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2795 	ifmedia_set(&sc->media, sc->media_flags);
2796 }
2797 
2798 
2799 /*
2800  * Determine the media type for a NIC.  Some XFPs will identify
2801  * themselves only when their link is up, so this is initiated via a
2802  * link up interrupt.  However, this can potentially take up to
2803  * several milliseconds, so it is run via the watchdog routine, rather
2804  * than in the interrupt handler itself.   This need only be done
2805  * once, not each time the link is up.
2806  */
2807 static void
2808 mxge_media_probe(mxge_softc_t *sc)
2809 {
2810 	mxge_cmd_t cmd;
2811 	char *cage_type;
2812 	char *ptr;
2813 	struct mxge_media_type *mxge_media_types = NULL;
2814 	int i, err, ms, mxge_media_type_entries;
2815 	uint32_t byte;
2816 
2817 	sc->need_media_probe = 0;
2818 
2819 	/* if we've already set a media type, we're done */
2820 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2821 		return;
2822 
2823 	/*
2824 	 * parse the product code to determine the interface type
2825 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2826 	 * after the 3rd dash in the driver's cached copy of the
2827 	 * EEPROM's product code string.
2828 	 */
2829 	ptr = sc->product_code_string;
2830 	if (ptr == NULL) {
2831 		device_printf(sc->dev, "Missing product code\n");
2832 	}
2833 
2834 	for (i = 0; i < 3; i++, ptr++) {
2835 		ptr = index(ptr, '-');
2836 		if (ptr == NULL) {
2837 			device_printf(sc->dev,
2838 				      "only %d dashes in PC?!?\n", i);
2839 			return;
2840 		}
2841 	}
2842 	if (*ptr == 'C') {
2843 		/* -C is CX4 */
2844 		mxge_set_media(sc, IFM_10G_CX4);
2845 		return;
2846 	}
2847 	else if (*ptr == 'Q') {
2848 		/* -Q is Quad Ribbon Fiber */
2849 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2850 		/* FreeBSD has no media type for Quad ribbon fiber */
2851 		return;
2852 	}
2853 
2854 	if (*ptr == 'R') {
2855 		/* -R is XFP */
2856 		mxge_media_types = mxge_xfp_media_types;
2857 		mxge_media_type_entries =
2858 			sizeof (mxge_xfp_media_types) /
2859 			sizeof (mxge_xfp_media_types[0]);
2860 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2861 		cage_type = "XFP";
2862 	}
2863 
2864 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2865 		/* -S or -2S is SFP+ */
2866 		mxge_media_types = mxge_sfp_media_types;
2867 		mxge_media_type_entries =
2868 			sizeof (mxge_sfp_media_types) /
2869 			sizeof (mxge_sfp_media_types[0]);
2870 		cage_type = "SFP+";
2871 		byte = 3;
2872 	}
2873 
2874 	if (mxge_media_types == NULL) {
2875 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2876 		return;
2877 	}
2878 
2879 	/*
2880 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2881 	 * we try to determine what is in the cage by using the
2882 	 * firmware's I2C commands to read the module's 10GbE compliance
2883 	 * register.  We read just one byte, which may take over
2884 	 * a millisecond
2885 	 */
2886 
2887 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2888 	cmd.data1 = byte;
2889 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2890 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2891 		device_printf(sc->dev, "failed to read XFP\n");
2892 	}
2893 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2894 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2895 	}
2896 	if (err != MXGEFW_CMD_OK) {
2897 		return;
2898 	}
2899 
2900 	/* now we wait for the data to be cached */
2901 	cmd.data0 = byte;
2902 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2903 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2904 		DELAY(1000);
2905 		cmd.data0 = byte;
2906 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2907 	}
2908 	if (err != MXGEFW_CMD_OK) {
2909 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2910 			      cage_type, err, ms);
2911 		return;
2912 	}
2913 
2914 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2915 		if (mxge_verbose)
2916 			device_printf(sc->dev, "%s:%s\n", cage_type,
2917 				      mxge_media_types[0].name);
2918 		mxge_set_media(sc, IFM_10G_CX4);
2919 		return;
2920 	}
2921 	for (i = 1; i < mxge_media_type_entries; i++) {
2922 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2923 			if (mxge_verbose)
2924 				device_printf(sc->dev, "%s:%s\n",
2925 					      cage_type,
2926 					      mxge_media_types[i].name);
2927 
2928 			mxge_set_media(sc, mxge_media_types[i].flag);
2929 			return;
2930 		}
2931 	}
2932 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2933 		      cmd.data0);
2934 
2935 	return;
2936 }
2937 
2938 static void
2939 mxge_intr(void *arg)
2940 {
2941 	struct mxge_slice_state *ss = arg;
2942 	mxge_softc_t *sc = ss->sc;
2943 	mcp_irq_data_t *stats = ss->fw_stats;
2944 	mxge_tx_ring_t *tx = &ss->tx;
2945 	mxge_rx_done_t *rx_done = &ss->rx_done;
2946 	uint32_t send_done_count;
2947 	uint8_t valid;
2948 
2949 
2950 #ifndef IFNET_BUF_RING
2951 	/* an interrupt on a non-zero slice is implicitly valid
2952 	   since MSI-X irqs are not shared */
2953 	if (ss != sc->ss) {
2954 		mxge_clean_rx_done(ss);
2955 		*ss->irq_claim = be32toh(3);
2956 		return;
2957 	}
2958 #endif
2959 
2960 	/* make sure the DMA has finished */
2961 	if (!stats->valid) {
2962 		return;
2963 	}
2964 	valid = stats->valid;
2965 
2966 	if (sc->legacy_irq) {
2967 		/* lower legacy IRQ  */
2968 		*sc->irq_deassert = 0;
2969 		if (!mxge_deassert_wait)
2970 			/* don't wait for conf. that irq is low */
2971 			stats->valid = 0;
2972 	} else {
2973 		stats->valid = 0;
2974 	}
2975 
2976 	/* loop while waiting for legacy irq deassertion */
2977 	do {
2978 		/* check for transmit completes and receives */
2979 		send_done_count = be32toh(stats->send_done_count);
2980 		while ((send_done_count != tx->pkt_done) ||
2981 		       (rx_done->entry[rx_done->idx].length != 0)) {
2982 			if (send_done_count != tx->pkt_done)
2983 				mxge_tx_done(ss, (int)send_done_count);
2984 			mxge_clean_rx_done(ss);
2985 			send_done_count = be32toh(stats->send_done_count);
2986 		}
2987 		if (sc->legacy_irq && mxge_deassert_wait)
2988 			wmb();
2989 	} while (*((volatile uint8_t *) &stats->valid));
2990 
2991 	/* fw link & error stats meaningful only on the first slice */
2992 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2993 		if (sc->link_state != stats->link_up) {
2994 			sc->link_state = stats->link_up;
2995 			if (sc->link_state) {
2996 				sc->ifp->if_link_state = LINK_STATE_UP;
2997 				if_link_state_change(sc->ifp);
2998 				if (mxge_verbose)
2999 					device_printf(sc->dev, "link up\n");
3000 			} else {
3001 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3002 				if_link_state_change(sc->ifp);
3003 				if (mxge_verbose)
3004 					device_printf(sc->dev, "link down\n");
3005 			}
3006 			sc->need_media_probe = 1;
3007 		}
3008 		if (sc->rdma_tags_available !=
3009 		    be32toh(stats->rdma_tags_available)) {
3010 			sc->rdma_tags_available =
3011 				be32toh(stats->rdma_tags_available);
3012 			device_printf(sc->dev, "RDMA timed out! %d tags "
3013 				      "left\n", sc->rdma_tags_available);
3014 		}
3015 
3016 		if (stats->link_down) {
3017 			sc->down_cnt += stats->link_down;
3018 			sc->link_state = 0;
3019 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3020 			if_link_state_change(sc->ifp);
3021 		}
3022 	}
3023 
3024 	/* check to see if we have rx token to pass back */
3025 	if (valid & 0x1)
3026 	    *ss->irq_claim = be32toh(3);
3027 	*(ss->irq_claim + 1) = be32toh(3);
3028 }
3029 
3030 static void
3031 mxge_init(void *arg)
3032 {
3033 }
3034 
3035 
3036 
3037 static void
3038 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3039 {
3040 	struct lro_entry *lro_entry;
3041 	int i;
3042 
3043 	while (!SLIST_EMPTY(&ss->lro_free)) {
3044 		lro_entry = SLIST_FIRST(&ss->lro_free);
3045 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3046 		kfree(lro_entry, M_DEVBUF);
3047 	}
3048 
3049 	for (i = 0; i <= ss->rx_big.mask; i++) {
3050 		if (ss->rx_big.info[i].m == NULL)
3051 			continue;
3052 		bus_dmamap_unload(ss->rx_big.dmat,
3053 				  ss->rx_big.info[i].map);
3054 		m_freem(ss->rx_big.info[i].m);
3055 		ss->rx_big.info[i].m = NULL;
3056 	}
3057 
3058 	for (i = 0; i <= ss->rx_small.mask; i++) {
3059 		if (ss->rx_small.info[i].m == NULL)
3060 			continue;
3061 		bus_dmamap_unload(ss->rx_small.dmat,
3062 				  ss->rx_small.info[i].map);
3063 		m_freem(ss->rx_small.info[i].m);
3064 		ss->rx_small.info[i].m = NULL;
3065 	}
3066 
3067 	/* transmit ring used only on the first slice */
3068 	if (ss->tx.info == NULL)
3069 		return;
3070 
3071 	for (i = 0; i <= ss->tx.mask; i++) {
3072 		ss->tx.info[i].flag = 0;
3073 		if (ss->tx.info[i].m == NULL)
3074 			continue;
3075 		bus_dmamap_unload(ss->tx.dmat,
3076 				  ss->tx.info[i].map);
3077 		m_freem(ss->tx.info[i].m);
3078 		ss->tx.info[i].m = NULL;
3079 	}
3080 }
3081 
3082 static void
3083 mxge_free_mbufs(mxge_softc_t *sc)
3084 {
3085 	int slice;
3086 
3087 	for (slice = 0; slice < sc->num_slices; slice++)
3088 		mxge_free_slice_mbufs(&sc->ss[slice]);
3089 }
3090 
3091 static void
3092 mxge_free_slice_rings(struct mxge_slice_state *ss)
3093 {
3094 	int i;
3095 
3096 
3097 	if (ss->rx_done.entry != NULL)
3098 		mxge_dma_free(&ss->rx_done.dma);
3099 	ss->rx_done.entry = NULL;
3100 
3101 	if (ss->tx.req_bytes != NULL)
3102 		kfree(ss->tx.req_bytes, M_DEVBUF);
3103 	ss->tx.req_bytes = NULL;
3104 
3105 	if (ss->tx.seg_list != NULL)
3106 		kfree(ss->tx.seg_list, M_DEVBUF);
3107 	ss->tx.seg_list = NULL;
3108 
3109 	if (ss->rx_small.shadow != NULL)
3110 		kfree(ss->rx_small.shadow, M_DEVBUF);
3111 	ss->rx_small.shadow = NULL;
3112 
3113 	if (ss->rx_big.shadow != NULL)
3114 		kfree(ss->rx_big.shadow, M_DEVBUF);
3115 	ss->rx_big.shadow = NULL;
3116 
3117 	if (ss->tx.info != NULL) {
3118 		if (ss->tx.dmat != NULL) {
3119 			for (i = 0; i <= ss->tx.mask; i++) {
3120 				bus_dmamap_destroy(ss->tx.dmat,
3121 						   ss->tx.info[i].map);
3122 			}
3123 			bus_dma_tag_destroy(ss->tx.dmat);
3124 		}
3125 		kfree(ss->tx.info, M_DEVBUF);
3126 	}
3127 	ss->tx.info = NULL;
3128 
3129 	if (ss->rx_small.info != NULL) {
3130 		if (ss->rx_small.dmat != NULL) {
3131 			for (i = 0; i <= ss->rx_small.mask; i++) {
3132 				bus_dmamap_destroy(ss->rx_small.dmat,
3133 						   ss->rx_small.info[i].map);
3134 			}
3135 			bus_dmamap_destroy(ss->rx_small.dmat,
3136 					   ss->rx_small.extra_map);
3137 			bus_dma_tag_destroy(ss->rx_small.dmat);
3138 		}
3139 		kfree(ss->rx_small.info, M_DEVBUF);
3140 	}
3141 	ss->rx_small.info = NULL;
3142 
3143 	if (ss->rx_big.info != NULL) {
3144 		if (ss->rx_big.dmat != NULL) {
3145 			for (i = 0; i <= ss->rx_big.mask; i++) {
3146 				bus_dmamap_destroy(ss->rx_big.dmat,
3147 						   ss->rx_big.info[i].map);
3148 			}
3149 			bus_dmamap_destroy(ss->rx_big.dmat,
3150 					   ss->rx_big.extra_map);
3151 			bus_dma_tag_destroy(ss->rx_big.dmat);
3152 		}
3153 		kfree(ss->rx_big.info, M_DEVBUF);
3154 	}
3155 	ss->rx_big.info = NULL;
3156 }
3157 
3158 static void
3159 mxge_free_rings(mxge_softc_t *sc)
3160 {
3161 	int slice;
3162 
3163 	for (slice = 0; slice < sc->num_slices; slice++)
3164 		mxge_free_slice_rings(&sc->ss[slice]);
3165 }
3166 
3167 static int
3168 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3169 		       int tx_ring_entries)
3170 {
3171 	mxge_softc_t *sc = ss->sc;
3172 	size_t bytes;
3173 	int err, i;
3174 
3175 	err = ENOMEM;
3176 
3177 	/* allocate per-slice receive resources */
3178 
3179 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3180 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3181 
3182 	/* allocate the rx shadow rings */
3183 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3184 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3185 	if (ss->rx_small.shadow == NULL)
3186 		return err;
3187 
3188 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3189 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3190 	if (ss->rx_big.shadow == NULL)
3191 		return err;
3192 
3193 	/* allocate the rx host info rings */
3194 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3195 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3196 	if (ss->rx_small.info == NULL)
3197 		return err;
3198 
3199 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3200 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3201 	if (ss->rx_big.info == NULL)
3202 		return err;
3203 
3204 	/* allocate the rx busdma resources */
3205 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3206 				 1,			/* alignment */
3207 				 4096,			/* boundary */
3208 				 BUS_SPACE_MAXADDR,	/* low */
3209 				 BUS_SPACE_MAXADDR,	/* high */
3210 				 NULL, NULL,		/* filter */
3211 				 MHLEN,			/* maxsize */
3212 				 1,			/* num segs */
3213 				 MHLEN,			/* maxsegsize */
3214 				 BUS_DMA_ALLOCNOW,	/* flags */
3215 				 &ss->rx_small.dmat);	/* tag */
3216 	if (err != 0) {
3217 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3218 			      err);
3219 		return err;
3220 	}
3221 
3222 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3223 				 1,			/* alignment */
3224 #if MXGE_VIRT_JUMBOS
3225 				 4096,			/* boundary */
3226 #else
3227 				 0,			/* boundary */
3228 #endif
3229 				 BUS_SPACE_MAXADDR,	/* low */
3230 				 BUS_SPACE_MAXADDR,	/* high */
3231 				 NULL, NULL,		/* filter */
3232 				 3*4096,		/* maxsize */
3233 #if MXGE_VIRT_JUMBOS
3234 				 3,			/* num segs */
3235 				 4096,			/* maxsegsize*/
3236 #else
3237 				 1,			/* num segs */
3238 				 MJUM9BYTES,		/* maxsegsize*/
3239 #endif
3240 				 BUS_DMA_ALLOCNOW,	/* flags */
3241 				 &ss->rx_big.dmat);	/* tag */
3242 	if (err != 0) {
3243 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3244 			      err);
3245 		return err;
3246 	}
3247 	for (i = 0; i <= ss->rx_small.mask; i++) {
3248 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249 					&ss->rx_small.info[i].map);
3250 		if (err != 0) {
3251 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3252 				      err);
3253 			return err;
3254 		}
3255 	}
3256 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3257 				&ss->rx_small.extra_map);
3258 	if (err != 0) {
3259 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3260 			      err);
3261 		return err;
3262 	}
3263 
3264 	for (i = 0; i <= ss->rx_big.mask; i++) {
3265 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266 					&ss->rx_big.info[i].map);
3267 		if (err != 0) {
3268 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3269 				      err);
3270 			return err;
3271 		}
3272 	}
3273 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3274 				&ss->rx_big.extra_map);
3275 	if (err != 0) {
3276 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3277 			      err);
3278 		return err;
3279 	}
3280 
3281 	/* now allocate TX resources */
3282 
3283 #ifndef IFNET_BUF_RING
3284 	/* only use a single TX ring for now */
3285 	if (ss != ss->sc->ss)
3286 		return 0;
3287 #endif
3288 
3289 	ss->tx.mask = tx_ring_entries - 1;
3290 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3291 
3292 
3293 	/* allocate the tx request copy block */
3294 	bytes = 8 +
3295 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3296 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3297 	if (ss->tx.req_bytes == NULL)
3298 		return err;
3299 	/* ensure req_list entries are aligned to 8 bytes */
3300 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3301 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3302 
3303 	/* allocate the tx busdma segment list */
3304 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3305 	ss->tx.seg_list = (bus_dma_segment_t *)
3306 		kmalloc(bytes, M_DEVBUF, M_WAITOK);
3307 	if (ss->tx.seg_list == NULL)
3308 		return err;
3309 
3310 	/* allocate the tx host info ring */
3311 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3312 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3313 	if (ss->tx.info == NULL)
3314 		return err;
3315 
3316 	/* allocate the tx busdma resources */
3317 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3318 				 1,			/* alignment */
3319 				 sc->tx_boundary,	/* boundary */
3320 				 BUS_SPACE_MAXADDR,	/* low */
3321 				 BUS_SPACE_MAXADDR,	/* high */
3322 				 NULL, NULL,		/* filter */
3323 				 65536 + 256,		/* maxsize */
3324 				 ss->tx.max_desc - 2,	/* num segs */
3325 				 sc->tx_boundary,	/* maxsegsz */
3326 				 BUS_DMA_ALLOCNOW,	/* flags */
3327 				 &ss->tx.dmat);		/* tag */
3328 
3329 	if (err != 0) {
3330 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3331 			      err);
3332 		return err;
3333 	}
3334 
3335 	/* now use these tags to setup dmamaps for each slot
3336 	   in the ring */
3337 	for (i = 0; i <= ss->tx.mask; i++) {
3338 		err = bus_dmamap_create(ss->tx.dmat, 0,
3339 					&ss->tx.info[i].map);
3340 		if (err != 0) {
3341 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3342 				      err);
3343 			return err;
3344 		}
3345 	}
3346 	return 0;
3347 
3348 }
3349 
3350 static int
3351 mxge_alloc_rings(mxge_softc_t *sc)
3352 {
3353 	mxge_cmd_t cmd;
3354 	int tx_ring_size;
3355 	int tx_ring_entries, rx_ring_entries;
3356 	int err, slice;
3357 
3358 	/* get ring sizes */
3359 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3360 	tx_ring_size = cmd.data0;
3361 	if (err != 0) {
3362 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3363 		goto abort;
3364 	}
3365 
3366 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3367 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3368 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3369 	ifq_set_ready(&sc->ifp->if_snd);
3370 
3371 	for (slice = 0; slice < sc->num_slices; slice++) {
3372 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3373 					     rx_ring_entries,
3374 					     tx_ring_entries);
3375 		if (err != 0)
3376 			goto abort;
3377 	}
3378 	return 0;
3379 
3380 abort:
3381 	mxge_free_rings(sc);
3382 	return err;
3383 
3384 }
3385 
3386 
3387 static void
3388 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3389 {
3390 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3391 
3392 	if (bufsize < MCLBYTES) {
3393 		/* easy, everything fits in a single buffer */
3394 		*big_buf_size = MCLBYTES;
3395 		*cl_size = MCLBYTES;
3396 		*nbufs = 1;
3397 		return;
3398 	}
3399 
3400 	if (bufsize < MJUMPAGESIZE) {
3401 		/* still easy, everything still fits in a single buffer */
3402 		*big_buf_size = MJUMPAGESIZE;
3403 		*cl_size = MJUMPAGESIZE;
3404 		*nbufs = 1;
3405 		return;
3406 	}
3407 #if MXGE_VIRT_JUMBOS
3408 	/* now we need to use virtually contiguous buffers */
3409 	*cl_size = MJUM9BYTES;
3410 	*big_buf_size = 4096;
3411 	*nbufs = mtu / 4096 + 1;
3412 	/* needs to be a power of two, so round up */
3413 	if (*nbufs == 3)
3414 		*nbufs = 4;
3415 #else
3416 	*cl_size = MJUM9BYTES;
3417 	*big_buf_size = MJUM9BYTES;
3418 	*nbufs = 1;
3419 #endif
3420 }
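
/*
 * Example (illustrative): a 9000-byte MTU gives
 * bufsize = 9000 + ETHER_HDR_LEN(14) + EVL_ENCAPLEN(4) + MXGEFW_PAD(2)
 *         = 9020, which exceeds MJUMPAGESIZE, so without
 * MXGE_VIRT_JUMBOS each receive slot gets one MJUM9BYTES cluster.
 */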
3421 
3422 static int
3423 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3424 {
3425 	mxge_softc_t *sc;
3426 	mxge_cmd_t cmd;
3427 	bus_dmamap_t map;
3428 	struct lro_entry *lro_entry;
3429 	int err, i, slice;
3430 
3431 
3432 	sc = ss->sc;
3433 	slice = ss - sc->ss;
3434 
3435 	SLIST_INIT(&ss->lro_free);
3436 	SLIST_INIT(&ss->lro_active);
3437 
3438 	for (i = 0; i < sc->lro_cnt; i++) {
3439 		lro_entry = (struct lro_entry *)
3440 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3441 			       M_NOWAIT | M_ZERO);
3442 		if (lro_entry == NULL) {
3443 			sc->lro_cnt = i;
3444 			break;
3445 		}
3446 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3447 	}
3448 	/* get the lanai pointers to the send and receive rings */
3449 
3450 	err = 0;
3451 #ifndef IFNET_BUF_RING
3452 	/* We currently only send from the first slice */
3453 	if (slice == 0) {
3454 #endif
3455 		cmd.data0 = slice;
3456 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3457 		ss->tx.lanai =
3458 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3459 		ss->tx.send_go = (volatile uint32_t *)
3460 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3461 		ss->tx.send_stop = (volatile uint32_t *)
3462 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3463 #ifndef IFNET_BUF_RING
3464 	}
3465 #endif
3466 	cmd.data0 = slice;
3467 	err |= mxge_send_cmd(sc,
3468 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3469 	ss->rx_small.lanai =
3470 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3471 	cmd.data0 = slice;
3472 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3473 	ss->rx_big.lanai =
3474 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3475 
3476 	if (err != 0) {
3477 		device_printf(sc->dev,
3478 			      "failed to get ring sizes or locations\n");
3479 		return EIO;
3480 	}
3481 
3482 	/* stock receive rings */
3483 	for (i = 0; i <= ss->rx_small.mask; i++) {
3484 		map = ss->rx_small.info[i].map;
3485 		err = mxge_get_buf_small(ss, map, i);
3486 		if (err) {
3487 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3488 				      i, ss->rx_small.mask + 1);
3489 			return ENOMEM;
3490 		}
3491 	}
3492 	for (i = 0; i <= ss->rx_big.mask; i++) {
3493 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3494 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3495 	}
3496 	ss->rx_big.nbufs = nbufs;
3497 	ss->rx_big.cl_size = cl_size;
3498 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3499 		EVL_ENCAPLEN + MXGEFW_PAD;
3500 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3501 		map = ss->rx_big.info[i].map;
3502 		err = mxge_get_buf_big(ss, map, i);
3503 		if (err) {
3504 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3505 				      i, ss->rx_big.mask + 1);
3506 			return ENOMEM;
3507 		}
3508 	}
3509 	return 0;
3510 }
3511 
3512 static int
3513 mxge_open(mxge_softc_t *sc)
3514 {
3515 	mxge_cmd_t cmd;
3516 	int err, big_bytes, nbufs, slice, cl_size, i;
3517 	bus_addr_t bus;
3518 	volatile uint8_t *itable;
3519 	struct mxge_slice_state *ss;
3520 
3521 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3522 	/* Copy the MAC address in case it was overridden */
3523 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3524 
3525 	err = mxge_reset(sc, 1);
3526 	if (err != 0) {
3527 		device_printf(sc->dev, "failed to reset\n");
3528 		return EIO;
3529 	}
3530 
3531 	if (sc->num_slices > 1) {
3532 		/* setup the indirection table */
3533 		cmd.data0 = sc->num_slices;
3534 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3535 				    &cmd);
3536 
3537 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3538 				     &cmd);
3539 		if (err != 0) {
3540 			device_printf(sc->dev,
3541 				      "failed to setup rss tables\n");
3542 			return err;
3543 		}
3544 
3545 		/* just enable an identity mapping */
3546 		itable = sc->sram + cmd.data0;
3547 		for (i = 0; i < sc->num_slices; i++)
3548 			itable[i] = (uint8_t)i;
3549 
3550 		cmd.data0 = 1;
3551 		cmd.data1 = mxge_rss_hash_type;
3552 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3553 		if (err != 0) {
3554 			device_printf(sc->dev, "failed to enable slices\n");
3555 			return err;
3556 		}
3557 	}
3558 
3559 
3560 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3561 
3562 	cmd.data0 = nbufs;
3563 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3564 			    &cmd);
3565 	/* error is only meaningful if we're trying to set
3566 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3567 	if (err && nbufs > 1) {
3568 		device_printf(sc->dev,
3569 			      "Failed to set alway-use-n to %d\n",
3570 			      nbufs);
3571 		return EIO;
3572 	}
3573 	/* Give the firmware the mtu and the big and small buffer
3574 	   sizes.  The firmware wants the big buf size to be a power
3575 	   of two. Luckily, FreeBSD's clusters are powers of two */
3576 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3577 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3578 	cmd.data0 = MHLEN - MXGEFW_PAD;
3579 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3580 			     &cmd);
3581 	cmd.data0 = big_bytes;
3582 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3583 
3584 	if (err != 0) {
3585 		device_printf(sc->dev, "failed to setup params\n");
3586 		goto abort;
3587 	}
3588 
3589 	/* Now give the firmware the pointer to the stats block */
3590 	for (slice = 0;
3591 #ifdef IFNET_BUF_RING
3592 	     slice < sc->num_slices;
3593 #else
3594 	     slice < 1;
3595 #endif
3596 	     slice++) {
3597 		ss = &sc->ss[slice];
3598 		cmd.data0 =
3599 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3600 		cmd.data1 =
3601 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3602 		cmd.data2 = sizeof(struct mcp_irq_data);
3603 		cmd.data2 |= (slice << 16);
3604 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3605 	}
3606 
3607 	if (err != 0) {
3608 		bus = sc->ss->fw_stats_dma.bus_addr;
3609 		bus += offsetof(struct mcp_irq_data, send_done_count);
3610 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3611 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3612 		err = mxge_send_cmd(sc,
3613 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3614 				    &cmd);
3615 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3616 		sc->fw_multicast_support = 0;
3617 	} else {
3618 		sc->fw_multicast_support = 1;
3619 	}
3620 
3621 	if (err != 0) {
3622 		device_printf(sc->dev, "failed to setup params\n");
3623 		goto abort;
3624 	}
3625 
3626 	for (slice = 0; slice < sc->num_slices; slice++) {
3627 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3628 		if (err != 0) {
3629 			device_printf(sc->dev, "couldn't open slice %d\n",
3630 				      slice);
3631 			goto abort;
3632 		}
3633 	}
3634 
3635 	/* Finally, start the firmware running */
3636 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3637 	if (err) {
3638 		device_printf(sc->dev, "Couldn't bring up link\n");
3639 		goto abort;
3640 	}
3641 #ifdef IFNET_BUF_RING
3642 	for (slice = 0; slice < sc->num_slices; slice++) {
3643 		ss = &sc->ss[slice];
3644 		ss->if_flags |= IFF_RUNNING;
3645 		ss->if_flags &= ~IFF_OACTIVE;
3646 	}
3647 #endif
3648 	sc->ifp->if_flags |= IFF_RUNNING;
3649 	sc->ifp->if_flags &= ~IFF_OACTIVE;
3650 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3651 
3652 	return 0;
3653 
3654 
3655 abort:
3656 	mxge_free_mbufs(sc);
3657 
3658 	return err;
3659 }
3660 
3661 static int
3662 mxge_close(mxge_softc_t *sc)
3663 {
3664 	mxge_cmd_t cmd;
3665 	int err, old_down_cnt;
3666 #ifdef IFNET_BUF_RING
3667 	struct mxge_slice_state *ss;
3668 	int slice;
3669 #endif
3670 
3671 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3672 	callout_stop(&sc->co_hdl);
3673 #ifdef IFNET_BUF_RING
3674 	for (slice = 0; slice < sc->num_slices; slice++) {
3675 		ss = &sc->ss[slice];
3676 		ss->if_flags &= ~IFF_RUNNING;
3677 	}
3678 #endif
3679 	sc->ifp->if_flags &= ~IFF_RUNNING;
3680 	old_down_cnt = sc->down_cnt;
3681 	wmb();
3682 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3683 	if (err) {
3684 		device_printf(sc->dev, "Couldn't bring down link\n");
3685 	}
3686 	if (old_down_cnt == sc->down_cnt) {
3687 		/* wait for down irq */
3688 		DELAY(10 * sc->intr_coal_delay);
3689 	}
3690 	wmb();
3691 	if (old_down_cnt == sc->down_cnt) {
3692 		device_printf(sc->dev, "never got down irq\n");
3693 	}
3694 
3695 	mxge_free_mbufs(sc);
3696 
3697 	return 0;
3698 }
3699 
3700 static void
3701 mxge_setup_cfg_space(mxge_softc_t *sc)
3702 {
3703 	device_t dev = sc->dev;
3704 	int reg;
3705 	uint16_t cmd, lnk, pectl;
3706 
3707 	/* find the PCIe link width and set max read request to 4KB*/
3708 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3709 		lnk = pci_read_config(dev, reg + 0x12, 2);
3710 		sc->link_width = (lnk >> 4) & 0x3f;
3711 
3712 		pectl = pci_read_config(dev, reg + 0x8, 2);
3713 		pectl = (pectl & ~0x7000) | (5 << 12);
3714 		pci_write_config(dev, reg + 0x8, pectl, 2);
3715 	}
3716 
3717 	/* Enable DMA and Memory space access */
3718 	pci_enable_busmaster(dev);
3719 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3720 	cmd |= PCIM_CMD_MEMEN;
3721 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3722 }
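
/*
 * Note (illustrative): bits 14:12 of the PCIe device control register
 * (offset 0x8 into the capability) encode the max read request size as
 * 128 << n bytes, so (5 << 12) requests 4KB transfers as the comment
 * above intends; the link-width field lives in the link status
 * register at offset 0x12, bits 9:4, matching the (lnk >> 4) & 0x3f.
 */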
3723 
3724 static uint32_t
3725 mxge_read_reboot(mxge_softc_t *sc)
3726 {
3727 	device_t dev = sc->dev;
3728 	uint32_t vs;
3729 
3730 	/* find the vendor specific offset */
3731 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3732 		device_printf(sc->dev,
3733 			      "could not find vendor specific offset\n");
3734 		return (uint32_t)-1;
3735 	}
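	/*
	 * the vendor-specific capability provides an indirect register
	 * window: a mode register at vs + 0x10, an address register at
	 * vs + 0x18 and a data register at vs + 0x14; 0xfffffff0 is
	 * presumably the address of the NIC's reboot status register
	 */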
3736 	/* enable read32 mode */
3737 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3738 	/* tell NIC which register to read */
3739 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3740 	return (pci_read_config(dev, vs + 0x14, 4));
3741 }
3742 
3743 static int
3744 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3745 {
3746 	struct pci_devinfo *dinfo;
3747 	mxge_tx_ring_t *tx;
3748 	int err;
3749 	uint32_t reboot;
3750 	uint16_t cmd;
3751 
3752 	err = ENXIO;
3753 
3754 	device_printf(sc->dev, "Watchdog reset!\n");
3755 
3756 	/*
3757 	 * check to see if the NIC rebooted.  If it did, then all of
3758 	 * PCI config space has been reset, and things like the
3759 	 * busmaster bit will be zero.  If this is the case, then we
3760 	 * must restore PCI config space before the NIC can be used
3761 	 * again
3762 	 */
3763 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
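	/* config space reads return all-ones while the device is off the bus */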
3764 	if (cmd == 0xffff) {
3765 		/*
3766 		 * maybe the watchdog caught the NIC rebooting; wait
3767 		 * up to 100ms for it to finish.  If it does not come
3768 		 * back, then give up
3769 		 */
3770 		DELAY(1000*100);
3771 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3772 		if (cmd == 0xffff) {
3773 			device_printf(sc->dev, "NIC disappeared!\n");
3774 			return (err);
3775 		}
3776 	}
3777 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3778 		/* print the reboot status */
3779 		reboot = mxge_read_reboot(sc);
3780 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3781 			      reboot);
3782 		/* restore PCI configuration space */
3783 		dinfo = device_get_ivars(sc->dev);
3784 		pci_cfg_restore(sc->dev, dinfo);
3785 
3786 		/* and redo any changes we made to our config space */
3787 		mxge_setup_cfg_space(sc);
3788 
3789 		if (sc->ifp->if_flags & IFF_RUNNING) {
3790 			mxge_close(sc);
3791 			err = mxge_open(sc);
3792 		}
3793 	} else {
3794 		tx = &sc->ss[slice].tx;
3795 		device_printf(sc->dev,
3796 			      "NIC did not reboot, slice %d ring state:\n",
3797 			      slice);
3798 		device_printf(sc->dev,
3799 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3800 			      tx->req, tx->done, tx->queue_active);
3801 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3802 			      tx->activate, tx->deactivate);
3803 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3804 			      tx->pkt_done,
3805 			      be32toh(sc->ss->fw_stats->send_done_count));
3806 		device_printf(sc->dev, "not resetting\n");
3807 	}
3808 	return (err);
3809 }
3810 
3811 static int
3812 mxge_watchdog(mxge_softc_t *sc)
3813 {
3814 	mxge_tx_ring_t *tx;
3815 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3816 	int i, err = 0;
3817 
3818 	/* see if we have outstanding transmits that have
3819 	   been pending for more than mxge_ticks */
3820 	for (i = 0;
3821 #ifdef IFNET_BUF_RING
3822 	     (i < sc->num_slices) && (err == 0);
3823 #else
3824 	     (i < 1) && (err == 0);
3825 #endif
3826 	     i++) {
3827 		tx = &sc->ss[i].tx;
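		/*
		 * a slice is stuck if requests are outstanding
		 * (req != done), they were already outstanding at the
		 * previous tick (watchdog_req != watchdog_done), and no
		 * completions have arrived since (done == watchdog_done)
		 */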
3828 		if (tx->req != tx->done &&
3829 		    tx->watchdog_req != tx->watchdog_done &&
3830 		    tx->done == tx->watchdog_done) {
3831 			/* check for pause blocking before resetting */
3832 			if (tx->watchdog_rx_pause == rx_pause)
3833 				err = mxge_watchdog_reset(sc, i);
3834 			else
3835 				device_printf(sc->dev, "Flow control blocking "
3836 					      "xmits, check link partner\n");
3837 		}
3838 
3839 		tx->watchdog_req = tx->req;
3840 		tx->watchdog_done = tx->done;
3841 		tx->watchdog_rx_pause = rx_pause;
3842 	}
3843 
3844 	if (sc->need_media_probe)
3845 		mxge_media_probe(sc);
3846 	return (err);
3847 }
3848 
3849 static void
3850 mxge_update_stats(mxge_softc_t *sc)
3851 {
3852 	struct mxge_slice_state *ss;
3853 	u_long ipackets = 0;
3854 	u_long opackets = 0;
3855 #ifdef IFNET_BUF_RING
3856 	u_long obytes = 0;
3857 	u_long omcasts = 0;
3858 	u_long odrops = 0;
3859 #endif
3860 	u_long oerrors = 0;
3861 	int slice;
3862 
3863 	for (slice = 0; slice < sc->num_slices; slice++) {
3864 		ss = &sc->ss[slice];
3865 		ipackets += ss->ipackets;
3866 		opackets += ss->opackets;
3867 #ifdef IFNET_BUF_RING
3868 		obytes += ss->obytes;
3869 		omcasts += ss->omcasts;
3870 		odrops += ss->tx.br->br_drops;
3871 #endif
3872 		oerrors += ss->oerrors;
3873 	}
3874 	sc->ifp->if_ipackets = ipackets;
3875 	sc->ifp->if_opackets = opackets;
3876 #ifdef IFNET_BUF_RING
3877 	sc->ifp->if_obytes = obytes;
3878 	sc->ifp->if_omcasts = omcasts;
3879 	sc->ifp->if_snd.ifq_drops = odrops;
3880 #endif
3881 	sc->ifp->if_oerrors = oerrors;
3882 }
3883 
3884 static void
3885 mxge_tick(void *arg)
3886 {
3887 	mxge_softc_t *sc = arg;
3888 	int err = 0;
3889 
3890 	lwkt_serialize_enter(sc->ifp->if_serializer);
3891 	/* aggregate stats from different slices */
3892 	mxge_update_stats(sc);
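	/* the watchdog itself only runs on every 4th tick */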
3893 	if (!sc->watchdog_countdown) {
3894 		err = mxge_watchdog(sc);
3895 		sc->watchdog_countdown = 4;
3896 	}
3897 	sc->watchdog_countdown--;
3898 	if (err == 0)
3899 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3900 	lwkt_serialize_exit(sc->ifp->if_serializer);
3901 }
3902 
3903 static int
3904 mxge_media_change(struct ifnet *ifp)
3905 {
3906 	return EINVAL;
3907 }
3908 
3909 static int
3910 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3911 {
3912 	struct ifnet *ifp = sc->ifp;
3913 	int real_mtu, old_mtu;
3914 	int err = 0;
3915 
3916 	if (ifp->if_serializer)
3917 		ASSERT_SERIALIZED(ifp->if_serializer);
3918 
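	/*
	 * the firmware limit applies to the full frame, so account for
	 * the Ethernet header and a possible 802.1Q VLAN tag
	 */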
3919 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3920 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3921 		return EINVAL;
3922 	old_mtu = ifp->if_mtu;
3923 	ifp->if_mtu = mtu;
3924 	if (ifp->if_flags & IFF_RUNNING) {
3925 		mxge_close(sc);
3926 		err = mxge_open(sc);
3927 		if (err != 0) {
3928 			ifp->if_mtu = old_mtu;
3929 			mxge_close(sc);
3930 			(void) mxge_open(sc);
3931 		}
3932 	}
3933 	return err;
3934 }
3935 
3936 static void
3937 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3938 {
3939 	mxge_softc_t *sc = ifp->if_softc;
3940 
3941 
3942 	if (sc == NULL)
3943 		return;
3944 	ifmr->ifm_status = IFM_AVALID;
3945 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3946 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3947 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3948 }
3949 
3950 static int
3951 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3952 {
3953 	mxge_softc_t *sc = ifp->if_softc;
3954 	struct ifreq *ifr = (struct ifreq *)data;
3955 	int err, mask;
3956 
3957 	(void)cr;
3958 	err = 0;
3959 	ASSERT_SERIALIZED(ifp->if_serializer);
3960 	switch (command) {
3961 	case SIOCSIFADDR:
3962 	case SIOCGIFADDR:
3963 		err = ether_ioctl(ifp, command, data);
3964 		break;
3965 
3966 	case SIOCSIFMTU:
3967 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3968 		break;
3969 
3970 	case SIOCSIFFLAGS:
3971 		if (sc->dying) {
3972 			return EINVAL;
3973 		}
3974 		if (ifp->if_flags & IFF_UP) {
3975 			if (!(ifp->if_flags & IFF_RUNNING)) {
3976 				err = mxge_open(sc);
3977 			} else {
3978 				/* take care of promisc and allmulti
3979 				   flag changes */
3980 				mxge_change_promisc(sc,
3981 						    ifp->if_flags & IFF_PROMISC);
3982 				mxge_set_multicast_list(sc);
3983 			}
3984 		} else {
3985 			if (ifp->if_flags & IFF_RUNNING) {
3986 				mxge_close(sc);
3987 			}
3988 		}
3989 		break;
3990 
3991 	case SIOCADDMULTI:
3992 	case SIOCDELMULTI:
3993 		mxge_set_multicast_list(sc);
3994 		break;
3995 
3996 	case SIOCSIFCAP:
3997 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3998 		if (mask & IFCAP_TXCSUM) {
3999 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4000 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4001 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4002 						      | CSUM_TSO);
4003 			} else {
4004 				ifp->if_capenable |= IFCAP_TXCSUM;
4005 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4006 			}
4007 		} else if (mask & IFCAP_RXCSUM) {
4008 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4009 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4010 				sc->csum_flag = 0;
4011 			} else {
4012 				ifp->if_capenable |= IFCAP_RXCSUM;
4013 				sc->csum_flag = 1;
4014 			}
4015 		}
4016 		if (mask & IFCAP_TSO4) {
4017 			if (IFCAP_TSO4 & ifp->if_capenable) {
4018 				ifp->if_capenable &= ~IFCAP_TSO4;
4019 				ifp->if_hwassist &= ~CSUM_TSO;
4020 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4021 				ifp->if_capenable |= IFCAP_TSO4;
4022 				ifp->if_hwassist |= CSUM_TSO;
4023 			} else {
4024 				kprintf("mxge requires tx checksum offload"
4025 				       " be enabled to use TSO\n");
4026 				err = EINVAL;
4027 			}
4028 		}
4029 		if (mask & IFCAP_LRO) {
4030 			if (IFCAP_LRO & ifp->if_capenable)
4031 				err = mxge_change_lro_locked(sc, 0);
4032 			else
4033 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4034 		}
4035 		if (mask & IFCAP_VLAN_HWTAGGING)
4036 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4037 		VLAN_CAPABILITIES(ifp);
4038 
4039 		break;
4040 
4041 	case SIOCGIFMEDIA:
4042 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4043 				    &sc->media, command);
4044 		break;
4045 
4046 	default:
4047 		err = ENOTTY;
4048 	}
4049 	return err;
4050 }
4051 
4052 static void
4053 mxge_fetch_tunables(mxge_softc_t *sc)
4054 {
4055 
4056 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4057 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4058 			  &mxge_flow_control);
4059 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4060 			  &mxge_intr_coal_delay);
4061 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4062 			  &mxge_nvidia_ecrc_enable);
4063 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4064 			  &mxge_force_firmware);
4065 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4066 			  &mxge_deassert_wait);
4067 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4068 			  &mxge_verbose);
4069 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4070 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4071 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4072 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4073 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4074 	if (sc->lro_cnt != 0)
4075 		mxge_lro_cnt = sc->lro_cnt;
4076 
4077 	if (bootverbose)
4078 		mxge_verbose = 1;
4079 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4080 		mxge_intr_coal_delay = 30;
4081 	if (mxge_ticks == 0)
4082 		mxge_ticks = hz / 2;
4083 	sc->pause = mxge_flow_control;
4084 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4085 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4086 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4087 	}
4088 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4089 	    mxge_initial_mtu < ETHER_MIN_LEN)
4090 		mxge_initial_mtu = ETHERMTU_JUMBO;
4091 }
4092 
4093 
4094 static void
4095 mxge_free_slices(mxge_softc_t *sc)
4096 {
4097 	struct mxge_slice_state *ss;
4098 	int i;
4099 
4100 
4101 	if (sc->ss == NULL)
4102 		return;
4103 
4104 	for (i = 0; i < sc->num_slices; i++) {
4105 		ss = &sc->ss[i];
4106 		if (ss->fw_stats != NULL) {
4107 			mxge_dma_free(&ss->fw_stats_dma);
4108 			ss->fw_stats = NULL;
4109 #ifdef IFNET_BUF_RING
4110 			if (ss->tx.br != NULL) {
4111 				drbr_free(ss->tx.br, M_DEVBUF);
4112 				ss->tx.br = NULL;
4113 			}
4114 #endif
4115 		}
4116 		if (ss->rx_done.entry != NULL) {
4117 			mxge_dma_free(&ss->rx_done.dma);
4118 			ss->rx_done.entry = NULL;
4119 		}
4120 	}
4121 	kfree(sc->ss, M_DEVBUF);
4122 	sc->ss = NULL;
4123 }
4124 
4125 static int
4126 mxge_alloc_slices(mxge_softc_t *sc)
4127 {
4128 	mxge_cmd_t cmd;
4129 	struct mxge_slice_state *ss;
4130 	size_t bytes;
4131 	int err, i, max_intr_slots;
4132 
4133 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4134 	if (err != 0) {
4135 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4136 		return err;
4137 	}
4138 	sc->rx_ring_size = cmd.data0;
4139 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
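	/*
	 * each descriptor is an mcp_dma_addr_t; the factor of two is
	 * presumably to leave room for completions from both the small
	 * and big receive rings
	 */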
4140 
4141 	bytes = sizeof (*sc->ss) * sc->num_slices;
4142 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4143 	if (sc->ss == NULL)
4144 		return (ENOMEM);
4145 	for (i = 0; i < sc->num_slices; i++) {
4146 		ss = &sc->ss[i];
4147 
4148 		ss->sc = sc;
4149 
4150 		/* allocate per-slice rx interrupt queues */
4151 
4152 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4153 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4154 		if (err != 0)
4155 			goto abort;
4156 		ss->rx_done.entry = ss->rx_done.dma.addr;
4157 		bzero(ss->rx_done.entry, bytes);
4158 
4159 		/*
4160 		 * allocate the per-slice firmware stats; stats
4161 		 * (including tx) are used only on the first
4162 		 * slice for now
4163 		 */
4164 #ifndef IFNET_BUF_RING
4165 		if (i > 0)
4166 			continue;
4167 #endif
4168 
4169 		bytes = sizeof (*ss->fw_stats);
4170 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4171 				     sizeof (*ss->fw_stats), 64);
4172 		if (err != 0)
4173 			goto abort;
4174 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4175 #ifdef IFNET_BUF_RING
4176 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4177 					   &ss->tx.lock);
4178 #endif
4179 	}
4180 
4181 	return (0);
4182 
4183 abort:
4184 	mxge_free_slices(sc);
4185 	return (ENOMEM);
4186 }
4187 
4188 static void
4189 mxge_slice_probe(mxge_softc_t *sc)
4190 {
4191 	mxge_cmd_t cmd;
4192 	char *old_fw;
4193 	int msix_cnt, status, max_intr_slots;
4194 
4195 	sc->num_slices = 1;
4196 	/*
4197 	 *  don't enable multiple slices if they have not been
4198 	 *  requested via the tunable, or if this is not an SMP system
4199 	 */
4200 
4201 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4202 		return;
4203 
4204 	/* see how many MSI-X interrupts are available */
4205 	msix_cnt = pci_msix_count(sc->dev);
4206 	if (msix_cnt < 2)
4207 		return;
4208 
4209 	/* now load the slice-aware firmware to see what it supports */
4210 	old_fw = sc->fw_name;
4211 	if (old_fw == mxge_fw_aligned)
4212 		sc->fw_name = mxge_fw_rss_aligned;
4213 	else
4214 		sc->fw_name = mxge_fw_rss_unaligned;
4215 	status = mxge_load_firmware(sc, 0);
4216 	if (status != 0) {
4217 		device_printf(sc->dev, "Falling back to a single slice\n");
4218 		return;
4219 	}
4220 
4221 	/* try to send a reset command to the card to see if it
4222 	   is alive */
4223 	memset(&cmd, 0, sizeof (cmd));
4224 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4225 	if (status != 0) {
4226 		device_printf(sc->dev, "failed reset\n");
4227 		goto abort_with_fw;
4228 	}
4229 
4230 	/* get rx ring size */
4231 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4232 	if (status != 0) {
4233 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4234 		goto abort_with_fw;
4235 	}
4236 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4237 
4238 	/* tell it the size of the interrupt queues */
4239 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4240 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4241 	if (status != 0) {
4242 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4243 		goto abort_with_fw;
4244 	}
4245 
4246 	/* ask the firmware how many slices it supports */
4247 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4248 	if (status != 0) {
4249 		device_printf(sc->dev,
4250 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4251 		goto abort_with_fw;
4252 	}
4253 	sc->num_slices = cmd.data0;
4254 	if (sc->num_slices > msix_cnt)
4255 		sc->num_slices = msix_cnt;
4256 
4257 	if (mxge_max_slices == -1) {
4258 		/* cap to number of CPUs in system */
4259 		if (sc->num_slices > ncpus)
4260 			sc->num_slices = ncpus;
4261 	} else {
4262 		if (sc->num_slices > mxge_max_slices)
4263 			sc->num_slices = mxge_max_slices;
4264 	}
4265 	/* round down to a power of two (presumably an RSS requirement) */
4266 	while (sc->num_slices & (sc->num_slices - 1))
4267 		sc->num_slices--;
4268 
4269 	if (mxge_verbose)
4270 		device_printf(sc->dev, "using %d slices\n",
4271 			      sc->num_slices);
4272 
4273 	return;
4274 
4275 abort_with_fw:
4276 	sc->fw_name = old_fw;
4277 	(void) mxge_load_firmware(sc, 0);
4278 }
4279 
4280 static int
4281 mxge_add_msix_irqs(mxge_softc_t *sc)
4282 {
4283 	size_t bytes;
4284 	int count, err, i, rid;
4285 
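	/* the MSI-X table lives in BAR 2 on this hardware */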
4286 	rid = PCIR_BAR(2);
4287 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4288 						    &rid, RF_ACTIVE);
4289 
4290 	if (sc->msix_table_res == NULL) {
4291 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4292 		return ENXIO;
4293 	}
4294 
4295 	count = sc->num_slices;
4296 	err = pci_alloc_msix(sc->dev, &count);
4297 	if (err != 0) {
4298 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4299 			      "err = %d\n", sc->num_slices, err);
4300 		goto abort_with_msix_table;
4301 	}
4302 	if (count < sc->num_slices) {
4303 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4304 			      sc->num_slices, count);
4305 		device_printf(sc->dev,
4306 			      "Try setting hw.mxge.max_slices to %d\n",
4307 			      count);
4308 		err = ENOSPC;
4309 		goto abort_with_msix;
4310 	}
4311 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4312 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4313 	if (sc->msix_irq_res == NULL) {
4314 		err = ENOMEM;
4315 		goto abort_with_msix;
4316 	}
4317 
4318 	for (i = 0; i < sc->num_slices; i++) {
4319 		rid = i + 1;
4320 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4321 							  SYS_RES_IRQ,
4322 							  &rid, RF_ACTIVE);
4323 		if (sc->msix_irq_res[i] == NULL) {
4324 			device_printf(sc->dev, "couldn't allocate IRQ res"
4325 				      " for message %d\n", i);
4326 			err = ENXIO;
4327 			goto abort_with_res;
4328 		}
4329 	}
4330 
4331 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4332 	sc->msix_ih =  kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4333 
4334 	for (i = 0; i < sc->num_slices; i++) {
4335 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4336 				     INTR_MPSAFE,
4337 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4338 				     sc->ifp->if_serializer);
4339 		if (err != 0) {
4340 			device_printf(sc->dev, "couldn't setup intr for "
4341 				      "message %d\n", i);
4342 			goto abort_with_intr;
4343 		}
4344 	}
4345 
4346 	if (mxge_verbose) {
4347 		device_printf(sc->dev, "using %d msix IRQs:",
4348 			      sc->num_slices);
4349 		for (i = 0; i < sc->num_slices; i++)
4350 			kprintf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4351 		kprintf("\n");
4352 	}
4353 	return (0);
4354 
4355 abort_with_intr:
4356 	for (i = 0; i < sc->num_slices; i++) {
4357 		if (sc->msix_ih[i] != NULL) {
4358 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4359 					  sc->msix_ih[i]);
4360 			sc->msix_ih[i] = NULL;
4361 		}
4362 	}
4363 	kfree(sc->msix_ih, M_DEVBUF);
4364 
4365 
4366 abort_with_res:
4367 	for (i = 0; i < sc->num_slices; i++) {
4368 		rid = i + 1;
4369 		if (sc->msix_irq_res[i] != NULL)
4370 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4371 					     sc->msix_irq_res[i]);
4372 		sc->msix_irq_res[i] = NULL;
4373 	}
4374 	kfree(sc->msix_irq_res, M_DEVBUF);
4375 
4376 
4377 abort_with_msix:
4378 	pci_release_msi(sc->dev);
4379 
4380 abort_with_msix_table:
4381 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4382 			     sc->msix_table_res);
4383 
4384 	return err;
4385 }
4386 
4387 static int
4388 mxge_add_single_irq(mxge_softc_t *sc)
4389 {
4390 	int count, err, rid;
4391 
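	/* rid 1 selects the MSI message, rid 0 the legacy INTx line */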
4392 	count = pci_msi_count(sc->dev);
4393 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4394 		rid = 1;
4395 	} else {
4396 		rid = 0;
4397 		sc->legacy_irq = 1;
4398 	}
4399 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4400 					 1, RF_SHAREABLE | RF_ACTIVE);
4401 	if (sc->irq_res == NULL) {
4402 		device_printf(sc->dev, "could not alloc interrupt\n");
4403 		return ENXIO;
4404 	}
4405 	if (mxge_verbose)
4406 		device_printf(sc->dev, "using %s irq %ld\n",
4407 			      sc->legacy_irq ? "INTx" : "MSI",
4408 			      rman_get_start(sc->irq_res));
4409 	err = bus_setup_intr(sc->dev, sc->irq_res,
4410 			     INTR_MPSAFE,
4411 			     mxge_intr, &sc->ss[0], &sc->ih,
4412 			     sc->ifp->if_serializer);
4413 	if (err != 0) {
4414 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4415 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4416 		if (!sc->legacy_irq)
4417 			pci_release_msi(sc->dev);
4418 	}
4419 	return err;
4420 }
4421 
4422 static void
4423 mxge_rem_msix_irqs(mxge_softc_t *sc)
4424 {
4425 	int i, rid;
4426 
4427 	for (i = 0; i < sc->num_slices; i++) {
4428 		if (sc->msix_ih[i] != NULL) {
4429 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4430 					  sc->msix_ih[i]);
4431 			sc->msix_ih[i] = NULL;
4432 		}
4433 	}
4434 	kfree(sc->msix_ih, M_DEVBUF);
4435 
4436 	for (i = 0; i < sc->num_slices; i++) {
4437 		rid = i + 1;
4438 		if (sc->msix_irq_res[i] != NULL)
4439 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4440 					     sc->msix_irq_res[i]);
4441 		sc->msix_irq_res[i] = NULL;
4442 	}
4443 	kfree(sc->msix_irq_res, M_DEVBUF);
4444 
4445 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4446 			     sc->msix_table_res);
4447 
4448 	pci_release_msi(sc->dev);
4449 	return;
4450 }
4451 
4452 static void
4453 mxge_rem_single_irq(mxge_softc_t *sc)
4454 {
4455 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4456 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4457 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4458 	if (!sc->legacy_irq)
4459 		pci_release_msi(sc->dev);
4460 }
4461 
4462 static void
4463 mxge_rem_irq(mxge_softc_t *sc)
4464 {
4465 	if (sc->num_slices > 1)
4466 		mxge_rem_msix_irqs(sc);
4467 	else
4468 		mxge_rem_single_irq(sc);
4469 }
4470 
4471 static int
4472 mxge_add_irq(mxge_softc_t *sc)
4473 {
4474 	int err;
4475 
4476 	if (sc->num_slices > 1)
4477 		err = mxge_add_msix_irqs(sc);
4478 	else
4479 		err = mxge_add_single_irq(sc);
4480 
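	/*
	 * the "0 &&" keeps this MSI-X re-probe path disabled;
	 * apparently a leftover debugging aid
	 */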
4481 	if (0 && err == 0 && sc->num_slices > 1) {
4482 		mxge_rem_msix_irqs(sc);
4483 		err = mxge_add_msix_irqs(sc);
4484 	}
4485 	return err;
4486 }
4487 
4488 
4489 static int
4490 mxge_attach(device_t dev)
4491 {
4492 	mxge_softc_t *sc = device_get_softc(dev);
4493 	struct ifnet *ifp = &sc->arpcom.ac_if;
4494 	int err, rid;
4495 
4496 	/*
4497 	 * avoid rewriting half the lines in this file to use
4498 	 * &sc->arpcom.ac_if instead
4499 	 */
4500 	sc->ifp = ifp;
4501 	sc->dev = dev;
4502 	mxge_fetch_tunables(sc);
4503 
4504 	err = bus_dma_tag_create(NULL,			/* parent */
4505 				 1,			/* alignment */
4506 				 0,			/* boundary */
4507 				 BUS_SPACE_MAXADDR,	/* low */
4508 				 BUS_SPACE_MAXADDR,	/* high */
4509 				 NULL, NULL,		/* filter */
4510 				 65536 + 256,		/* maxsize */
4511 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4512 				 65536,			/* maxsegsize */
4513 				 0,			/* flags */
4514 				 &sc->parent_dmat);	/* tag */
4515 
4516 	if (err != 0) {
4517 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4518 			      err);
4519 		goto abort_with_nothing;
4520 	}
4521 
4523 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4524 
4525 	callout_init_mp(&sc->co_hdl);
4526 
4527 	mxge_setup_cfg_space(sc);
4528 
4529 	/* Map the board into the kernel */
4530 	rid = PCIR_BARS;
4531 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4532 					 ~0, 1, RF_ACTIVE);
4533 	if (sc->mem_res == NULL) {
4534 		device_printf(dev, "could not map memory\n");
4535 		err = ENXIO;
4536 		goto abort_with_nothing;
4537 	}
4538 	sc->sram = rman_get_virtual(sc->mem_res);
4539 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
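	/*
	 * total SRAM is 2MB; the regions subtracted here are presumably
	 * reserved for firmware use, plus a final 0x100 bytes
	 */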
4540 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4541 		device_printf(dev, "impossible memory region size %ld\n",
4542 			      rman_get_size(sc->mem_res));
4543 		err = ENXIO;
4544 		goto abort_with_mem_res;
4545 	}
4546 
4547 	/* make a NUL-terminated copy of the EEPROM strings section of
4548 	   LANai SRAM */
4549 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4550 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4551 				rman_get_bushandle(sc->mem_res),
4552 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4553 				sc->eeprom_strings,
4554 				MXGE_EEPROM_STRINGS_SIZE - 2);
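	/*
	 * reading two bytes less than the bzero'd buffer guarantees the
	 * string list ends in at least two NUL bytes
	 */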
4555 	err = mxge_parse_strings(sc);
4556 	if (err != 0)
4557 		goto abort_with_mem_res;
4558 
4559 	/* Enable write combining for efficient use of PCIe bus */
4560 	mxge_enable_wc(sc);
4561 
4562 	/* Allocate the out of band dma memory */
4563 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4564 			     sizeof (mxge_cmd_t), 64);
4565 	if (err != 0)
4566 		goto abort_with_mem_res;
4567 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4568 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4569 	if (err != 0)
4570 		goto abort_with_cmd_dma;
4571 
4572 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4573 	if (err != 0)
4574 		goto abort_with_zeropad_dma;
4575 
4576 	/* select & load the firmware */
4577 	err = mxge_select_firmware(sc);
4578 	if (err != 0)
4579 		goto abort_with_dmabench;
4580 	sc->intr_coal_delay = mxge_intr_coal_delay;
4581 
4582 	mxge_slice_probe(sc);
4583 	err = mxge_alloc_slices(sc);
4584 	if (err != 0)
4585 		goto abort_with_dmabench;
4586 
4587 	err = mxge_reset(sc, 0);
4588 	if (err != 0)
4589 		goto abort_with_slices;
4590 
4591 	err = mxge_alloc_rings(sc);
4592 	if (err != 0) {
4593 		device_printf(sc->dev, "failed to allocate rings\n");
4594 		goto abort_with_dmabench;
4595 	}
4596 
4597 	ifp->if_baudrate = IF_Gbps(10UL);
4598 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4599 		IFCAP_VLAN_MTU;
4600 #ifdef INET
4601 	ifp->if_capabilities |= IFCAP_LRO;
4602 #endif
4603 
4604 #ifdef MXGE_NEW_VLAN_API
4605 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4606 #endif
4607 
4608 	sc->max_mtu = mxge_max_mtu(sc);
4609 	if (sc->max_mtu >= 9000)
4610 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4611 	else
4612 		device_printf(dev, "MTU limited to %d.  Install "
4613 			      "latest firmware for 9000 byte jumbo support\n",
4614 			      sc->max_mtu - ETHER_HDR_LEN);
4615 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4616 	ifp->if_capenable = ifp->if_capabilities;
4617 	if (sc->lro_cnt == 0)
4618 		ifp->if_capenable &= ~IFCAP_LRO;
4619 	sc->csum_flag = 1;
4620 	ifp->if_init = mxge_init;
4621 	ifp->if_softc = sc;
4622 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4623 	ifp->if_ioctl = mxge_ioctl;
4624 	ifp->if_start = mxge_start;
4625 	/* Initialise the ifmedia structure */
4626 	ifmedia_init(&sc->media, 0, mxge_media_change,
4627 		     mxge_media_status);
4628 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4629 	mxge_media_probe(sc);
4630 	sc->dying = 0;
4631 	ether_ifattach(ifp, sc->mac_addr, NULL);
4632 	/* ether_ifattach sets mtu to ETHERMTU */
4633 	if (mxge_initial_mtu != ETHERMTU) {
4634 		lwkt_serialize_enter(ifp->if_serializer);
4635 		mxge_change_mtu(sc, mxge_initial_mtu);
4636 		lwkt_serialize_exit(ifp->if_serializer);
4637 	}
4638 	/* must come after ether_ifattach() */
4639 	err = mxge_add_irq(sc);
4640 	if (err != 0) {
4641 		device_printf(sc->dev, "failed to add irq\n");
4642 		goto abort_with_rings;
4643 	}
4644 
4645 	mxge_add_sysctls(sc);
4646 #ifdef IFNET_BUF_RING
4647 	ifp->if_transmit = mxge_transmit;
4648 	ifp->if_qflush = mxge_qflush;
4649 #endif
4650 	return 0;
4651 
4652 abort_with_rings:
4653 	mxge_free_rings(sc);
4654 abort_with_slices:
4655 	mxge_free_slices(sc);
4656 abort_with_dmabench:
4657 	mxge_dma_free(&sc->dmabench_dma);
4658 abort_with_zeropad_dma:
4659 	mxge_dma_free(&sc->zeropad_dma);
4660 abort_with_cmd_dma:
4661 	mxge_dma_free(&sc->cmd_dma);
4662 abort_with_mem_res:
4663 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4664 	pci_disable_busmaster(dev);
4665 	bus_dma_tag_destroy(sc->parent_dmat);
4666 abort_with_nothing:
4667 	return err;
4668 }
4669 
4670 static int
4671 mxge_detach(device_t dev)
4672 {
4673 	mxge_softc_t *sc = device_get_softc(dev);
4674 
4675 	lwkt_serialize_enter(sc->ifp->if_serializer);
4676 	sc->dying = 1;
4677 	if (sc->ifp->if_flags & IFF_RUNNING)
4678 		mxge_close(sc);
4679 	/*
4680 	 * XXX: race: the callout callback could be spinning on
4681 	 * the serializer and run anyway
4682 	 */
4683 	callout_stop(&sc->co_hdl);
4684 	lwkt_serialize_exit(sc->ifp->if_serializer);
4685 
4686 	ether_ifdetach(sc->ifp);
4687 	ifmedia_removeall(&sc->media);
4688 	mxge_dummy_rdma(sc, 0);
4689 	mxge_rem_sysctls(sc);
4690 	mxge_rem_irq(sc);
4691 	mxge_free_rings(sc);
4692 	mxge_free_slices(sc);
4693 	mxge_dma_free(&sc->dmabench_dma);
4694 	mxge_dma_free(&sc->zeropad_dma);
4695 	mxge_dma_free(&sc->cmd_dma);
4696 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4697 	pci_disable_busmaster(dev);
4698 	bus_dma_tag_destroy(sc->parent_dmat);
4699 	return 0;
4700 }
4701 
4702 static int
4703 mxge_shutdown(device_t dev)
4704 {
4705 	return 0;
4706 }
4707 
4708 /*
4709   This file uses Myri10GE driver indentation.
4710 
4711   Local Variables:
4712   c-file-style:"linux"
4713   tab-width:8
4714   End:
4715 */
4716