xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 956939d5)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 /*__FBSDID("$FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $");*/
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/in_cksum.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kernel.h>
43 #include <sys/module.h>
44 #include <sys/serialize.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 
48 /* count xmits ourselves, rather than via drbr */
49 #define NO_SLOW_STATS
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 
57 #include <net/bpf.h>
58 
59 #include <net/if_types.h>
60 #include <net/vlan/if_vlan_var.h>
61 #include <net/zlib.h>
62 
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
67 
68 #include <sys/bus.h>
69 #include <sys/rman.h>
70 
71 #include <bus/pci/pcireg.h>
72 #include <bus/pci/pcivar.h>
73 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
74 
75 #include <vm/vm.h>		/* for pmap_mapdev() */
76 #include <vm/pmap.h>
77 
78 #if defined(__i386) || defined(__amd64)
79 #include <machine/specialreg.h>
80 #endif
81 
82 #include <dev/netif/mxge/mxge_mcp.h>
83 #include <dev/netif/mxge/mcp_gen_header.h>
84 /*#define MXGE_FAKE_IFP*/
85 #include <dev/netif/mxge/if_mxge_var.h>
86 #ifdef IFNET_BUF_RING
87 #include <sys/buf_ring.h>
88 #endif
89 
90 #include "opt_inet.h"
91 
92 /* tunable params */
93 static int mxge_nvidia_ecrc_enable = 1;
94 static int mxge_force_firmware = 0;
95 static int mxge_intr_coal_delay = 30;
96 static int mxge_deassert_wait = 1;
97 static int mxge_flow_control = 1;
98 static int mxge_verbose = 0;
99 static int mxge_lro_cnt = 8;
100 static int mxge_ticks;
101 static int mxge_max_slices = 1;
102 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
103 static int mxge_always_promisc = 0;
104 /* XXX: not yet */
105 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
106 static int mxge_initial_mtu = ETHERMTU;
107 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
108 static char *mxge_fw_aligned = "mxge_eth_z8e";
109 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
110 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
111 
112 static int mxge_probe(device_t dev);
113 static int mxge_attach(device_t dev);
114 static int mxge_detach(device_t dev);
115 static int mxge_shutdown(device_t dev);
116 static void mxge_intr(void *arg);
117 
118 static device_method_t mxge_methods[] =
119 {
120   /* Device interface */
121   DEVMETHOD(device_probe, mxge_probe),
122   DEVMETHOD(device_attach, mxge_attach),
123   DEVMETHOD(device_detach, mxge_detach),
124   DEVMETHOD(device_shutdown, mxge_shutdown),
125   {0, 0}
126 };
127 
128 static driver_t mxge_driver =
129 {
130   "mxge",
131   mxge_methods,
132   sizeof(mxge_softc_t),
133 };
134 
135 static devclass_t mxge_devclass;
136 
137 /* Declare ourselves to be a child of the PCI bus.*/
138 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
139 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
140 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
141 
142 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
143 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
144 static int mxge_close(mxge_softc_t *sc);
145 static int mxge_open(mxge_softc_t *sc);
146 static void mxge_tick(void *arg);
147 
148 /* XXX: we don't have Large Receive Offload support yet */
149 inline int
150 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
151 {
152 	(void)ss;
153 	(void)m_head;
154 	(void)csum;
155 	return 1;
156 }
157 
158 inline void
159 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
160 {
161 	(void)ss;
162 	(void)lro;
163 }
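/*
 * Note: a non-zero return from mxge_lro_rx() tells the rx path that the
 * frame was not consumed by LRO, so the caller hands it to the stack
 * unmodified.  The stubs above preserve that contract while LRO is
 * unimplemented.
 */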
164 
165 static int
166 mxge_probe(device_t dev)
167 {
168 	int rev;
169 
170 
171 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
172 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
173 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
174 		rev = pci_get_revid(dev);
175 		switch (rev) {
176 		case MXGE_PCI_REV_Z8E:
177 			device_set_desc(dev, "Myri10G-PCIE-8A");
178 			break;
179 		case MXGE_PCI_REV_Z8ES:
180 			device_set_desc(dev, "Myri10G-PCIE-8B");
181 			break;
182 		default:
183 			device_set_desc(dev, "Myri10G-PCIE-8??");
184 			device_printf(dev, "Unrecognized rev %d NIC\n",
185 				      rev);
186 			break;
187 		}
188 		return 0;
189 	}
190 	return ENXIO;
191 }
192 
193 static void
194 mxge_enable_wc(mxge_softc_t *sc)
195 {
196 #if 0
197 #if defined(__i386) || defined(__amd64)
198 	vm_offset_t len;
199 	int err;
200 
201 	sc->wc = 1;
202 	len = rman_get_size(sc->mem_res);
203 	err = pmap_change_attr((vm_offset_t) sc->sram,
204 			       len, PAT_WRITE_COMBINING);
205 	if (err != 0) {
206 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
207 			      err);
208 		sc->wc = 0;
209 	}
210 #endif
211 #else
212 	sc->wc = 0;	/* TBD: PAT support */
213 #endif
214 }
215 
216 
217 /* callback to get our DMA address */
218 static void
219 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
220 			 int error)
221 {
222 	if (error == 0) {
223 		*(bus_addr_t *) arg = segs->ds_addr;
224 	}
225 }
226 
227 static int
228 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
229 		   bus_size_t alignment)
230 {
231 	int err;
232 	device_t dev = sc->dev;
233 	bus_size_t boundary, maxsegsize;
234 
235 	if (bytes > 4096 && alignment == 4096) {
236 		boundary = 0;
237 		maxsegsize = bytes;
238 	} else {
239 		boundary = 4096;
240 		maxsegsize = 4096;
241 	}
242 
243 	/* allocate DMAable memory tags */
244 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
245 				 alignment,		/* alignment */
246 				 boundary,		/* boundary */
247 				 BUS_SPACE_MAXADDR,	/* low */
248 				 BUS_SPACE_MAXADDR,	/* high */
249 				 NULL, NULL,		/* filter */
250 				 bytes,			/* maxsize */
251 				 1,			/* num segs */
252 				 maxsegsize,		/* maxsegsize */
253 				 BUS_DMA_COHERENT,	/* flags */
254 				 &dma->dmat);		/* tag */
255 	if (err != 0) {
256 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
257 		return err;
258 	}
259 
260 	/* allocate DMAable memory & map */
261 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
262 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
263 				| BUS_DMA_ZERO),  &dma->map);
264 	if (err != 0) {
265 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
266 		goto abort_with_dmat;
267 	}
268 
269 	/* load the memory */
270 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
271 			      mxge_dmamap_callback,
272 			      (void *)&dma->bus_addr, 0);
273 	if (err != 0) {
274 		device_printf(dev, "couldn't load map (err = %d)\n", err);
275 		goto abort_with_mem;
276 	}
277 	return 0;
278 
279 abort_with_mem:
280 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
281 abort_with_dmat:
282 	(void)bus_dma_tag_destroy(dma->dmat);
283 	return err;
284 }
285 
286 
287 static void
288 mxge_dma_free(mxge_dma_t *dma)
289 {
290 	bus_dmamap_unload(dma->dmat, dma->map);
291 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
292 	(void)bus_dma_tag_destroy(dma->dmat);
293 }
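/*
 * Illustrative (hypothetical) use of the pair above -- allocate a chunk
 * of DMA-safe memory, hand its bus address to the NIC, and release it on
 * teardown; error handling omitted for brevity:
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
 *		cmd.data0 = MXGE_LOWPART_TO_U32(dma.bus_addr);
 *		cmd.data1 = MXGE_HIGHPART_TO_U32(dma.bus_addr);
 *		...
 *		mxge_dma_free(&dma);
 *	}
 */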
294 
295 /*
296  * The eeprom strings on the lanaiX have the format
297  * SN=x\0
298  * MAC=x:x:x:x:x:x\0
299  * PC=text\0
300  */
301 
302 static int
303 mxge_parse_strings(mxge_softc_t *sc)
304 {
305 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
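/*
 * Note that MXGE_NEXT_STRING ignores its argument and advances the local
 * "ptr" directly; it simply skips to the byte following the next NUL
 * terminator (or stops at "limit").
 */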
306 
307 	char *ptr, *limit;
308 	int i, found_mac;
309 
310 	ptr = sc->eeprom_strings;
311 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
312 	found_mac = 0;
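	/*
	 * Worked example with a (hypothetical) entry "MAC=00:60:dd:43:14:a2":
	 * the "ptr += 1" below leaves ptr on the 'A'; the first "ptr += 3"
	 * then lands on "00", and each later "ptr += 3" skips a hex pair
	 * plus its ':' separator, so strtoul() parses 00, 60, dd, 43, 14
	 * and a2 in turn.
	 */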
313 	while (ptr < limit && *ptr != '\0') {
314 		if (memcmp(ptr, "MAC=", 4) == 0) {
315 			ptr += 1;
316 			sc->mac_addr_string = ptr;
317 			for (i = 0; i < 6; i++) {
318 				ptr += 3;
319 				if ((ptr + 2) > limit)
320 					goto abort;
321 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
322 				found_mac = 1;
323 			}
324 		} else if (memcmp(ptr, "PC=", 3) == 0) {
325 			ptr += 3;
326 			strncpy(sc->product_code_string, ptr,
327 				sizeof (sc->product_code_string) - 1);
328 		} else if (memcmp(ptr, "SN=", 3) == 0) {
329 			ptr += 3;
330 			strncpy(sc->serial_number_string, ptr,
331 				sizeof (sc->serial_number_string) - 1);
332 		}
333 		MXGE_NEXT_STRING(ptr);
334 	}
335 
336 	if (found_mac)
337 		return 0;
338 
339  abort:
340 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
341 
342 	return ENXIO;
343 }
344 
345 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
346 static void
347 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
348 {
349 	uint32_t val;
350 	unsigned long base, off;
351 	char *va, *cfgptr;
352 	device_t pdev, mcp55;
353 	uint16_t vendor_id, device_id, word;
354 	uintptr_t bus, slot, func, ivend, idev;
355 	uint32_t *ptr32;
356 
357 
358 	if (!mxge_nvidia_ecrc_enable)
359 		return;
360 
361 	pdev = device_get_parent(device_get_parent(sc->dev));
362 	if (pdev == NULL) {
363 		device_printf(sc->dev, "could not find parent?\n");
364 		return;
365 	}
366 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
367 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
368 
369 	if (vendor_id != 0x10de)
370 		return;
371 
372 	base = 0;
373 
374 	if (device_id == 0x005d) {
375 		/* ck804, base address is magic */
376 		base = 0xe0000000UL;
377 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
378 		/* mcp55, base address stored in chipset */
379 		mcp55 = pci_find_bsf(0, 0, 0);
380 		if (mcp55 &&
381 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
382 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
383 			word = pci_read_config(mcp55, 0x90, 2);
384 			base = ((unsigned long)word & 0x7ffeU) << 25;
385 		}
386 	}
387 	if (!base)
388 		return;
389 
390 	/* XXXX
391 	   Test below is commented because it is believed that doing
392 	   config read/write beyond 0xff will access the config space
393 	   for the next larger function.  Uncomment this and remove
394 	   the hacky pmap_mapdev() way of accessing config space when
395 	   FreeBSD grows support for extended pcie config space access
396 	*/
397 #if 0
398 	/* See if we can, by some miracle, access the extended
399 	   config space */
400 	val = pci_read_config(pdev, 0x178, 4);
401 	if (val != 0xffffffff) {
402 		val |= 0x40;
403 		pci_write_config(pdev, 0x178, val, 4);
404 		return;
405 	}
406 #endif
407 	/* Rather than using normal pci config space writes, we must
408 	 * map the Nvidia config space ourselves.  This is because on
409 	 * opteron/nvidia class machines the 0xe0000000 mapping is
410 	 * handled by the nvidia chipset; that means the internal PCI
411 	 * device (the on-chip northbridge), or the amd-8131 bridge
412 	 * and things behind them, are not visible by this method.
413 	 */
414 
415 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
416 		      PCI_IVAR_BUS, &bus);
417 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
418 		      PCI_IVAR_SLOT, &slot);
419 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
420 		      PCI_IVAR_FUNCTION, &func);
421 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
422 		      PCI_IVAR_VENDOR, &ivend);
423 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
424 		      PCI_IVAR_DEVICE, &idev);
425 
426 	off =  base
427 		+ 0x00100000UL * (unsigned long)bus
428 		+ 0x00001000UL * (unsigned long)(func
429 						 + 8 * slot);
430 
431 	/* map it into the kernel */
432 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
433 
434 
435 	if (va == NULL) {
436 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
437 		return;
438 	}
439 	/* get a pointer to the config space mapped into the kernel */
440 	cfgptr = va + (off & PAGE_MASK);
441 
442 	/* make sure that we can really access it */
443 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
444 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
445 	if (! (vendor_id == ivend && device_id == idev)) {
446 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
447 			      vendor_id, device_id);
448 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
449 		return;
450 	}
451 
452 	ptr32 = (uint32_t*)(cfgptr + 0x178);
453 	val = *ptr32;
454 
455 	if (val == 0xffffffff) {
456 		device_printf(sc->dev, "extended mapping failed\n");
457 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
458 		return;
459 	}
460 	*ptr32 = val | 0x40;
461 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
462 	if (mxge_verbose)
463 		device_printf(sc->dev,
464 			      "Enabled ECRC on upstream Nvidia bridge "
465 			      "at %d:%d:%d\n",
466 			      (int)bus, (int)slot, (int)func);
467 	return;
468 }
469 #else
470 static void
471 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
472 {
473 	device_printf(sc->dev,
474 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
475 	return;
476 }
477 #endif
478 
479 
480 static int
481 mxge_dma_test(mxge_softc_t *sc, int test_type)
482 {
483 	mxge_cmd_t cmd;
484 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
485 	int status;
486 	uint32_t len;
487 	char *test = " ";
488 
489 
490 	/* Run a small DMA test.
491 	 * The magic multipliers to the length tell the firmware
492 	 * to do DMA read, write, or read+write tests.  The
493 	 * results are returned in cmd.data0.  The upper 16
494 	 * bits of the return is the number of transfers completed.
495 	 * The lower 16 bits is the time in 0.5us ticks that the
496 	 * transfers took to complete.
497 	 */
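	/*
	 * Worked example: with len = 4096 and cmd.data0 = 0x01000200,
	 * 0x0100 (256) transfers completed in 0x0200 (512) half-microsecond
	 * ticks, so read_dma = (256 * 4096 * 2) / 512 = 4096 MB/s
	 * (bytes per microsecond is equivalent to MB/s here).
	 */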
498 
499 	len = sc->tx_boundary;
500 
501 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
502 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
503 	cmd.data2 = len * 0x10000;
504 	status = mxge_send_cmd(sc, test_type, &cmd);
505 	if (status != 0) {
506 		test = "read";
507 		goto abort;
508 	}
509 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
510 		(cmd.data0 & 0xffff);
511 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
512 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
513 	cmd.data2 = len * 0x1;
514 	status = mxge_send_cmd(sc, test_type, &cmd);
515 	if (status != 0) {
516 		test = "write";
517 		goto abort;
518 	}
519 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
520 		(cmd.data0 & 0xffff);
521 
522 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
523 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
524 	cmd.data2 = len * 0x10001;
525 	status = mxge_send_cmd(sc, test_type, &cmd);
526 	if (status != 0) {
527 		test = "read/write";
528 		goto abort;
529 	}
530 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
531 		(cmd.data0 & 0xffff);
532 
533 abort:
534 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
535 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
536 			      test, status);
537 
538 	return status;
539 }
540 
541 /*
542  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
543  * when the PCI-E Completion packets are aligned on an 8-byte
544  * boundary.  Some PCI-E chip sets always align Completion packets; on
545  * the ones that do not, the alignment can be enforced by enabling
546  * ECRC generation (if supported).
547  *
548  * When PCI-E Completion packets are not aligned, it is actually more
549  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
550  *
551  * If the driver can neither enable ECRC nor verify that it has
552  * already been enabled, then it must use a firmware image which works
553  * around unaligned completion packets (ethp_z8e.dat), and it should
554  * also ensure that it never gives the device a Read-DMA which is
555  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
556  * enabled, then the driver should use the aligned (eth_z8e.dat)
557  * firmware image, and set tx_boundary to 4KB.
558  */
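/*
 * In short, the probe below leaves the driver in one of two states:
 *
 *	aligned completions:	fw_name = mxge_fw_aligned ("mxge_eth_z8e"),
 *				tx_boundary = 4096
 *	unaligned completions:	fw_name = mxge_fw_unaligned ("mxge_ethp_z8e"),
 *				tx_boundary = 2048
 */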
559 
560 static int
561 mxge_firmware_probe(mxge_softc_t *sc)
562 {
563 	device_t dev = sc->dev;
564 	int reg, status;
565 	uint16_t pectl;
566 
567 	sc->tx_boundary = 4096;
568 	/*
569 	 * Verify the max read request size was set to 4KB
570 	 * before trying the test with 4KB.
571 	 */
572 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
573 		pectl = pci_read_config(dev, reg + 0x8, 2);
574 		if ((pectl & (5 << 12)) != (5 << 12)) {
575 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
576 				      pectl);
577 			sc->tx_boundary = 2048;
578 		}
579 	}
580 
581 	/*
582 	 * load the optimized firmware (which assumes aligned PCIe
583 	 * completions) in order to see if it works on this host.
584 	 */
585 	sc->fw_name = mxge_fw_aligned;
586 	status = mxge_load_firmware(sc, 1);
587 	if (status != 0) {
588 		return status;
589 	}
590 
591 	/*
592 	 * Enable ECRC if possible
593 	 */
594 	mxge_enable_nvidia_ecrc(sc);
595 
596 	/*
597 	 * Run a DMA test which watches for unaligned completions and
598 	 * aborts on the first one seen.
599 	 */
600 
601 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
602 	if (status == 0)
603 		return 0; /* keep the aligned firmware */
604 
605 	if (status != E2BIG)
606 		device_printf(dev, "DMA test failed: %d\n", status);
607 	if (status == ENOSYS)
608 		device_printf(dev, "Falling back to ethp! "
609 			      "Please install up to date fw\n");
610 	return status;
611 }
612 
613 static int
614 mxge_select_firmware(mxge_softc_t *sc)
615 {
616 	int aligned = 0;
617 
618 
619 	if (mxge_force_firmware != 0) {
620 		if (mxge_force_firmware == 1)
621 			aligned = 1;
622 		else
623 			aligned = 0;
624 		if (mxge_verbose)
625 			device_printf(sc->dev,
626 				      "Assuming %s completions (forced)\n",
627 				      aligned ? "aligned" : "unaligned");
628 		goto abort;
629 	}
630 
631 	/* if the PCIe link width is 4 or less, we can use the aligned
632 	   firmware and skip any checks */
633 	if (sc->link_width != 0 && sc->link_width <= 4) {
634 		device_printf(sc->dev,
635 			      "PCIe x%d Link, expect reduced performance\n",
636 			      sc->link_width);
637 		aligned = 1;
638 		goto abort;
639 	}
640 
641 	if (0 == mxge_firmware_probe(sc))
642 		return 0;
643 
644 abort:
645 	if (aligned) {
646 		sc->fw_name = mxge_fw_aligned;
647 		sc->tx_boundary = 4096;
648 	} else {
649 		sc->fw_name = mxge_fw_unaligned;
650 		sc->tx_boundary = 2048;
651 	}
652 	return (mxge_load_firmware(sc, 0));
653 }
654 
655 union qualhack
656 {
657         const char *ro_char;
658         char *rw_char;
659 };
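/*
 * "qualhack" exists only to launder a const-qualified firmware image
 * pointer into a writable one without a cast warning.
 */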
660 
661 static int
662 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
663 {
664 
665 
666 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
667 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
668 			      be32toh(hdr->mcp_type));
669 		return EIO;
670 	}
671 
672 	/* save firmware version for sysctl */
673 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
674 	if (mxge_verbose)
675 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
676 
677 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
678 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
679 
680 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
681 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
682 		device_printf(sc->dev, "Found firmware version %s\n",
683 			      sc->fw_version);
684 		device_printf(sc->dev, "Driver needs %d.%d\n",
685 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
686 		return EINVAL;
687 	}
688 	return 0;
689 
690 }
691 
692 #if 0
693 static void *
694 z_alloc(void *nil, u_int items, u_int size)
695 {
696         void *ptr;
697 
698         ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
699         return ptr;
700 }
701 
702 static void
703 z_free(void *nil, void *ptr)
704 {
705         kfree(ptr, M_TEMP);
706 }
707 #endif
708 
709 static int
710 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
711 {
712 	struct fw_image *fw;
713 	const mcp_gen_header_t *hdr;
714 	unsigned hdr_offset;
715 	int status;
716 	unsigned int i;
717 	char dummy;
718 	size_t fw_len;
719 
720 	fw = firmware_image_load(sc->fw_name, NULL);
721 	if (fw == NULL) {
722 		device_printf(sc->dev, "Could not find firmware image %s\n",
723 			      sc->fw_name);
724 		return ENOENT;
725 	}
726 #if 0
727 	/* setup zlib and decompress f/w */
728 	bzero(&zs, sizeof (zs));
729 	zs.zalloc = z_alloc;
730 	zs.zfree = z_free;
731 	status = inflateInit(&zs);
732 	if (status != Z_OK) {
733 		status = EIO;
734 		goto abort_with_fw;
735 	}
736 
737 	/* the uncompressed size is stored as the firmware version,
738 	   which would otherwise go unused */
739 	fw_len = (size_t) fw->version;
740 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
741 	if (inflate_buffer == NULL)
742 		goto abort_with_zs;
743 	zs.avail_in = fw->datasize;
744 	zs.next_in = __DECONST(char *, fw->data);
745 	zs.avail_out = fw_len;
746 	zs.next_out = inflate_buffer;
747 	status = inflate(&zs, Z_FINISH);
748 	if (status != Z_STREAM_END) {
749 		device_printf(sc->dev, "zlib %d\n", status);
750 		status = EIO;
751 		goto abort_with_buffer;
752 	}
753 #endif
754 	fw_len = fw->fw_imglen;
755 	/* check id */
756 	hdr_offset = htobe32(*(const uint32_t *)
757 			     (fw->fw_image + MCP_HEADER_PTR_OFFSET));
758 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
759 		device_printf(sc->dev, "Bad firmware file");
760 		status = EIO;
761 		goto abort_with_fw;
762 	}
763 	hdr = (const void*)(fw->fw_image + hdr_offset);
764 
765 	status = mxge_validate_firmware(sc, hdr);
766 	if (status != 0)
767 		goto abort_with_fw;
768 
769 	/* Copy the inflated firmware to NIC SRAM. */
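	/*
	 * The copy is done in 256-byte chunks; the read-back of the first
	 * SRAM byte after each chunk (bracketed by write barriers) flushes
	 * any write-combined/posted PIO writes to the NIC before the next
	 * chunk is started.
	 */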
770 	for (i = 0; i < fw_len; i += 256) {
771 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
772 			      fw->fw_image + i,
773 			      min(256U, (unsigned)(fw_len - i)));
774 		wmb();
775 		dummy = *sc->sram;
776 		wmb();
777 	}
778 
779 	*limit = fw_len;
780 	status = 0;
781 #if 0
782 abort_with_buffer:
783 	kfree(inflate_buffer, M_TEMP);
784 abort_with_zs:
785 	inflateEnd(&zs);
786 #endif
787 abort_with_fw:
788 	firmware_image_unload(fw);
789 	return status;
790 }
791 
792 /*
793  * Enable or disable periodic RDMAs from the host to make certain
794  * chipsets resend dropped PCIe messages
795  */
796 
797 static void
798 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
799 {
800 	char buf_bytes[72];
801 	volatile uint32_t *confirm;
802 	volatile char *submit;
803 	uint32_t *buf, dma_low, dma_high;
804 	int i;
805 
806 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
807 
808 	/* clear confirmation addr */
809 	confirm = (volatile uint32_t *)sc->cmd;
810 	*confirm = 0;
811 	wmb();
812 
813 	/* send an rdma command to the PCIe engine, and wait for the
814 	   response in the confirmation address.  The firmware should
815 	   write a -1 there to indicate it is alive and well
816 	*/
817 
818 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
819 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
820 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
821 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
822 	buf[2] = htobe32(0xffffffff);		/* confirm data */
823 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
824 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
825 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
826 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
827 	buf[5] = htobe32(enable);			/* enable? */
828 
829 
830 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
831 
832 	mxge_pio_copy(submit, buf, 64);
833 	wmb();
834 	DELAY(1000);
835 	wmb();
836 	i = 0;
837 	while (*confirm != 0xffffffff && i < 20) {
838 		DELAY(1000);
839 		i++;
840 	}
841 	if (*confirm != 0xffffffff) {
842 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
843 			      (enable ? "enable" : "disable"), confirm,
844 			      *confirm);
845 	}
846 	return;
847 }
848 
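/*
 * Issue a command to the firmware: the 8-byte-aligned mcp_cmd_t is
 * PIO-copied (fields in big-endian) into the MXGEFW_ETH_CMD window, and
 * the firmware DMAs an mcp_cmd_response_t back into host memory at
 * sc->cmd, which we poll for up to 20ms.
 */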
849 static int
850 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
851 {
852 	mcp_cmd_t *buf;
853 	char buf_bytes[sizeof(*buf) + 8];
854 	volatile mcp_cmd_response_t *response = sc->cmd;
855 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
856 	uint32_t dma_low, dma_high;
857 	int err, sleep_total = 0;
858 
859 	/*
860 	 * We may be called during attach, before if_serializer is available.
861 	 * This is not a fast path, just check for NULL
862 	 */
863 
864 	if (sc->ifp->if_serializer)
865 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
866 
867 	/* ensure buf is aligned to 8 bytes */
868 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
869 
870 	buf->data0 = htobe32(data->data0);
871 	buf->data1 = htobe32(data->data1);
872 	buf->data2 = htobe32(data->data2);
873 	buf->cmd = htobe32(cmd);
874 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
875 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
876 
877 	buf->response_addr.low = htobe32(dma_low);
878 	buf->response_addr.high = htobe32(dma_high);
879 
880 
881 	response->result = 0xffffffff;
882 	wmb();
883 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
884 
885 	/* wait up to 20ms */
886 	err = EAGAIN;
887 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
888 		bus_dmamap_sync(sc->cmd_dma.dmat,
889 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
890 		wmb();
891 		switch (be32toh(response->result)) {
892 		case 0:
893 			data->data0 = be32toh(response->data);
894 			err = 0;
895 			break;
896 		case 0xffffffff:
897 			DELAY(1000);
898 			break;
899 		case MXGEFW_CMD_UNKNOWN:
900 			err = ENOSYS;
901 			break;
902 		case MXGEFW_CMD_ERROR_UNALIGNED:
903 			err = E2BIG;
904 			break;
905 		case MXGEFW_CMD_ERROR_BUSY:
906 			err = EBUSY;
907 			break;
908 		default:
909 			device_printf(sc->dev,
910 				      "mxge: command %d "
911 				      "failed, result = %d\n",
912 				      cmd, be32toh(response->result));
913 			err = ENXIO;
914 			break;
915 		}
916 		if (err != EAGAIN)
917 			break;
918 	}
919 	if (err == EAGAIN)
920 		device_printf(sc->dev, "mxge: command %d timed out"
921 			      "result = %d\n",
922 			      cmd, be32toh(response->result));
923 	return err;
924 }
925 
926 static int
927 mxge_adopt_running_firmware(mxge_softc_t *sc)
928 {
929 	struct mcp_gen_header *hdr;
930 	const size_t bytes = sizeof (struct mcp_gen_header);
931 	size_t hdr_offset;
932 	int status;
933 
934 	/* find running firmware header */
935 	hdr_offset = htobe32(*(volatile uint32_t *)
936 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
937 
938 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
939 		device_printf(sc->dev,
940 			      "Running firmware has bad header offset (%d)\n",
941 			      (int)hdr_offset);
942 		return EIO;
943 	}
944 
945 	/* copy header of running firmware from SRAM to host memory to
946 	 * validate firmware */
947 	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
948 	if (hdr == NULL) {
949 		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
950 		return ENOMEM;
951 	}
952 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
953 				rman_get_bushandle(sc->mem_res),
954 				hdr_offset, (char *)hdr, bytes);
955 	status = mxge_validate_firmware(sc, hdr);
956 	kfree(hdr, M_DEVBUF);
957 
958 	/*
959 	 * check to see if adopted firmware has bug where adopting
960 	 * it will cause broadcasts to be filtered unless the NIC
961 	 * is kept in ALLMULTI mode
962 	 */
963 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
964 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
965 		sc->adopted_rx_filter_bug = 1;
966 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
967 			      "working around rx filter bug\n",
968 			      sc->fw_ver_major, sc->fw_ver_minor,
969 			      sc->fw_ver_tiny);
970 	}
971 
972 	return status;
973 }
974 
975 
976 static int
977 mxge_load_firmware(mxge_softc_t *sc, int adopt)
978 {
979 	volatile uint32_t *confirm;
980 	volatile char *submit;
981 	char buf_bytes[72];
982 	uint32_t *buf, size, dma_low, dma_high;
983 	int status, i;
984 
985 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
986 
987 	size = sc->sram_size;
988 	status = mxge_load_firmware_helper(sc, &size);
989 	if (status) {
990 		if (!adopt)
991 			return status;
992 		/* Try to use the currently running firmware, if
993 		   it is new enough */
994 		status = mxge_adopt_running_firmware(sc);
995 		if (status) {
996 			device_printf(sc->dev,
997 				      "failed to adopt running firmware\n");
998 			return status;
999 		}
1000 		device_printf(sc->dev,
1001 			      "Successfully adopted running firmware\n");
1002 		if (sc->tx_boundary == 4096) {
1003 			device_printf(sc->dev,
1004 				"Using firmware currently running on NIC"
1005 				 ".  For optimal\n");
1006 			device_printf(sc->dev,
1007 				 "performance consider loading optimized "
1008 				 "firmware\n");
1009 		}
1010 		sc->fw_name = mxge_fw_unaligned;
1011 		sc->tx_boundary = 2048;
1012 		return 0;
1013 	}
1014 	/* clear confirmation addr */
1015 	confirm = (volatile uint32_t *)sc->cmd;
1016 	*confirm = 0;
1017 	wmb();
1018 	/* send a reload command to the bootstrap MCP, and wait for the
1019 	   response in the confirmation address.  The firmware should
1020 	   write a -1 there to indicate it is alive and well
1021 	*/
1022 
1023 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1024 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1025 
1026 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1027 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1028 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1029 
1030 	/* FIX: All newest firmware should un-protect the bottom of
1031 	   the sram before handoff. However, the very first interfaces
1032 	   do not. Therefore the handoff copy must skip the first 8 bytes
1033 	*/
1034 					/* where the code starts*/
1035 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1036 	buf[4] = htobe32(size - 8); 	/* length of code */
1037 	buf[5] = htobe32(8);		/* where to copy to */
1038 	buf[6] = htobe32(0);		/* where to jump to */
1039 
1040 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1041 	mxge_pio_copy(submit, buf, 64);
1042 	wmb();
1043 	DELAY(1000);
1044 	wmb();
1045 	i = 0;
1046 	while (*confirm != 0xffffffff && i < 20) {
1047 		DELAY(1000*10);
1048 		i++;
1049 		bus_dmamap_sync(sc->cmd_dma.dmat,
1050 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1051 	}
1052 	if (*confirm != 0xffffffff) {
1053 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1054 			confirm, *confirm);
1055 
1056 		return ENXIO;
1057 	}
1058 	return 0;
1059 }
1060 
1061 static int
1062 mxge_update_mac_address(mxge_softc_t *sc)
1063 {
1064 	mxge_cmd_t cmd;
1065 	uint8_t *addr = sc->mac_addr;
1066 	int status;
1067 
1068 
1069 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1070 		     | (addr[2] << 8) | addr[3]);
1071 
1072 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1073 
1074 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1075 	return status;
1076 }
1077 
1078 static int
1079 mxge_change_pause(mxge_softc_t *sc, int pause)
1080 {
1081 	mxge_cmd_t cmd;
1082 	int status;
1083 
1084 	if (pause)
1085 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1086 				       &cmd);
1087 	else
1088 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1089 				       &cmd);
1090 
1091 	if (status) {
1092 		device_printf(sc->dev, "Failed to set flow control mode\n");
1093 		return ENXIO;
1094 	}
1095 	sc->pause = pause;
1096 	return 0;
1097 }
1098 
1099 static void
1100 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1101 {
1102 	mxge_cmd_t cmd;
1103 	int status;
1104 
1105 	if (sc->ifp->if_serializer)
1106 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
1107 	if (mxge_always_promisc)
1108 		promisc = 1;
1109 
1110 	if (promisc)
1111 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1112 				       &cmd);
1113 	else
1114 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1115 				       &cmd);
1116 
1117 	if (status) {
1118 		device_printf(sc->dev, "Failed to set promisc mode\n");
1119 	}
1120 }
1121 
1122 static void
1123 mxge_set_multicast_list(mxge_softc_t *sc)
1124 {
1125 	mxge_cmd_t cmd;
1126 	struct ifmultiaddr *ifma;
1127 	struct ifnet *ifp = sc->ifp;
1128 	int err;
1129 
1130 	if (ifp->if_serializer)
1131 		ASSERT_SERIALIZED(ifp->if_serializer);
1132 
1133 	/* This firmware is known to not support multicast */
1134 	if (!sc->fw_multicast_support)
1135 		return;
1136 
1137 	/* Disable multicast filtering while we play with the lists*/
1138 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1139 	if (err != 0) {
1140 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1141 		       " error status: %d\n", err);
1142 		return;
1143 	}
1144 
1145 	if (sc->adopted_rx_filter_bug)
1146 		return;
1147 
1148 	if (ifp->if_flags & IFF_ALLMULTI)
1149 		/* request to disable multicast filtering, so quit here */
1150 		return;
1151 
1152 	/* Flush all the filters */
1153 
1154 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1155 	if (err != 0) {
1156 		device_printf(sc->dev,
1157 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1158 			      ", error status: %d\n", err);
1159 		return;
1160 	}
1161 
1162 	/* Walk the multicast list, and add each address */
1163 
1164 	LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1165 		if (ifma->ifma_addr->sa_family != AF_LINK)
1166 			continue;
1167 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1168 		      &cmd.data0, 4);
1169 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1170 		      &cmd.data1, 2);
1171 		cmd.data0 = htonl(cmd.data0);
1172 		cmd.data1 = htonl(cmd.data1);
1173 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1174 		if (err != 0) {
1175 			device_printf(sc->dev, "Failed "
1176 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1177 			       "%d\t", err);
1178 			/* abort, leaving multicast filtering off */
1179 			return;
1180 		}
1181 	}
1182 	/* Enable multicast filtering */
1183 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1184 	if (err != 0) {
1185 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1186 		       ", error status: %d\n", err);
1187 	}
1188 }
1189 
1190 static int
1191 mxge_max_mtu(mxge_softc_t *sc)
1192 {
1193 	mxge_cmd_t cmd;
1194 	int status;
1195 
1196 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1197 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1198 
1199 	/* try to set nbufs to see if we can
1200 	   use virtually contiguous jumbos */
1201 	cmd.data0 = 0;
1202 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1203 			       &cmd);
1204 	if (status == 0)
1205 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1206 
1207 	/* otherwise, we're limited to MJUMPAGESIZE */
1208 	return MJUMPAGESIZE - MXGEFW_PAD;
1209 }
1210 
1211 static int
1212 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1213 {
1214 	struct mxge_slice_state *ss;
1215 	mxge_rx_done_t *rx_done;
1216 	volatile uint32_t *irq_claim;
1217 	mxge_cmd_t cmd;
1218 	int slice, status;
1219 
1220 	/* try to send a reset command to the card to see if it
1221 	   is alive */
1222 	memset(&cmd, 0, sizeof (cmd));
1223 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1224 	if (status != 0) {
1225 		device_printf(sc->dev, "failed reset\n");
1226 		return ENXIO;
1227 	}
1228 
1229 	mxge_dummy_rdma(sc, 1);
1230 
1231 
1232 	/* set the intrq size */
1233 	cmd.data0 = sc->rx_ring_size;
1234 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1235 
1236 	/*
1237 	 * Even though we already know how many slices are supported
1238 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1239 	 * has magic side effects, and must be called after a reset.
1240 	 * It must be called prior to calling any RSS related cmds,
1241 	 * including assigning an interrupt queue for anything but
1242 	 * slice 0.  It must also be called *after*
1243 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1244 	 * the firmware to compute offsets.
1245 	 */
1246 
1247 	if (sc->num_slices > 1) {
1248 		/* ask the maximum number of slices it supports */
1249 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1250 					   &cmd);
1251 		if (status != 0) {
1252 			device_printf(sc->dev,
1253 				      "failed to get number of slices\n");
1254 			return status;
1255 		}
1256 		/*
1257 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1258 		 * to setting up the interrupt queue DMA
1259 		 */
1260 		cmd.data0 = sc->num_slices;
1261 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1262 #ifdef IFNET_BUF_RING
1263 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1264 #endif
1265 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1266 					   &cmd);
1267 		if (status != 0) {
1268 			device_printf(sc->dev,
1269 				      "failed to set number of slices\n");
1270 			return status;
1271 		}
1272 	}
1273 
1274 
1275 	if (interrupts_setup) {
1276 		/* Now exchange information about interrupts  */
1277 		for (slice = 0; slice < sc->num_slices; slice++) {
1278 			rx_done = &sc->ss[slice].rx_done;
1279 			memset(rx_done->entry, 0, sc->rx_ring_size);
1280 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1281 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1282 			cmd.data2 = slice;
1283 			status |= mxge_send_cmd(sc,
1284 						MXGEFW_CMD_SET_INTRQ_DMA,
1285 						&cmd);
1286 		}
1287 	}
1288 
1289 	status |= mxge_send_cmd(sc,
1290 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1291 
1292 
1293 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1294 
1295 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1296 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1297 
1298 
1299 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1300 				&cmd);
1301 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1302 	if (status != 0) {
1303 		device_printf(sc->dev, "failed set interrupt parameters\n");
1304 		return status;
1305 	}
1306 
1307 
1308 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1309 
1310 
1311 	/* run a DMA benchmark */
1312 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1313 
1314 	for (slice = 0; slice < sc->num_slices; slice++) {
1315 		ss = &sc->ss[slice];
1316 
1317 		ss->irq_claim = irq_claim + (2 * slice);
1318 		/* reset mcp/driver shared state back to 0 */
1319 		ss->rx_done.idx = 0;
1320 		ss->rx_done.cnt = 0;
1321 		ss->tx.req = 0;
1322 		ss->tx.done = 0;
1323 		ss->tx.pkt_done = 0;
1324 		ss->tx.queue_active = 0;
1325 		ss->tx.activate = 0;
1326 		ss->tx.deactivate = 0;
1327 		ss->tx.wake = 0;
1328 		ss->tx.defrag = 0;
1329 		ss->tx.stall = 0;
1330 		ss->rx_big.cnt = 0;
1331 		ss->rx_small.cnt = 0;
1332 		ss->lro_bad_csum = 0;
1333 		ss->lro_queued = 0;
1334 		ss->lro_flushed = 0;
1335 		if (ss->fw_stats != NULL) {
1336 			ss->fw_stats->valid = 0;
1337 			ss->fw_stats->send_done_count = 0;
1338 		}
1339 	}
1340 	sc->rdma_tags_available = 15;
1341 	status = mxge_update_mac_address(sc);
1342 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1343 	mxge_change_pause(sc, sc->pause);
1344 	mxge_set_multicast_list(sc);
1345 	return status;
1346 }
1347 
1348 static int
1349 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1350 {
1351         mxge_softc_t *sc;
1352         unsigned int intr_coal_delay;
1353         int err;
1354 
1355         sc = arg1;
1356         intr_coal_delay = sc->intr_coal_delay;
1357         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1358         if (err != 0) {
1359                 return err;
1360         }
1361         if (intr_coal_delay == sc->intr_coal_delay)
1362                 return 0;
1363 
1364         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1365                 return EINVAL;
1366 
1367 	lwkt_serialize_enter(sc->ifp->if_serializer);
1368 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1369 	sc->intr_coal_delay = intr_coal_delay;
1370 
1371 	lwkt_serialize_exit(sc->ifp->if_serializer);
1372         return err;
1373 }
1374 
1375 static int
1376 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1377 {
1378         mxge_softc_t *sc;
1379         unsigned int enabled;
1380         int err;
1381 
1382         sc = arg1;
1383         enabled = sc->pause;
1384         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1385         if (err != 0) {
1386                 return err;
1387         }
1388         if (enabled == sc->pause)
1389                 return 0;
1390 
1391 	lwkt_serialize_enter(sc->ifp->if_serializer);
1392 	err = mxge_change_pause(sc, enabled);
1393 	lwkt_serialize_exit(sc->ifp->if_serializer);
1394         return err;
1395 }
1396 
1397 static int
1398 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1399 {
1400 	struct ifnet *ifp;
1401 	int err = 0;
1402 
1403 	ifp = sc->ifp;
1404 	if (lro_cnt == 0)
1405 		ifp->if_capenable &= ~IFCAP_LRO;
1406 	else
1407 		ifp->if_capenable |= IFCAP_LRO;
1408 	sc->lro_cnt = lro_cnt;
1409 	if (ifp->if_flags & IFF_RUNNING) {
1410 		mxge_close(sc);
1411 		err = mxge_open(sc);
1412 	}
1413 	return err;
1414 }
1415 
1416 static int
1417 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1418 {
1419 	mxge_softc_t *sc;
1420 	unsigned int lro_cnt;
1421 	int err;
1422 
1423 	sc = arg1;
1424 	lro_cnt = sc->lro_cnt;
1425 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1426 	if (err != 0)
1427 		return err;
1428 
1429 	if (lro_cnt == sc->lro_cnt)
1430 		return 0;
1431 
1432 	if (lro_cnt > 128)
1433 		return EINVAL;
1434 
1435 	lwkt_serialize_enter(sc->ifp->if_serializer);
1436 	err = mxge_change_lro_locked(sc, lro_cnt);
1437 	lwkt_serialize_exit(sc->ifp->if_serializer);
1438 	return err;
1439 }
1440 
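/*
 * Sysctl shim for the big-endian firmware counters: the byte-swapped
 * value is passed to sysctl_handle_int() via arg2 with arg1 forced to
 * NULL, so reads report the swapped value and writes fail (there is no
 * backing store for these read-only counters).
 */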
1441 static int
1442 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1443 {
1444         int err;
1445 
1446         if (arg1 == NULL)
1447                 return EFAULT;
1448         arg2 = be32toh(*(int *)arg1);
1449         arg1 = NULL;
1450         err = sysctl_handle_int(oidp, arg1, arg2, req);
1451 
1452         return err;
1453 }
1454 
1455 static void
1456 mxge_rem_sysctls(mxge_softc_t *sc)
1457 {
1458 	struct mxge_slice_state *ss;
1459 	int slice;
1460 
1461 	if (sc->slice_sysctl_tree == NULL)
1462 		return;
1463 
1464 	for (slice = 0; slice < sc->num_slices; slice++) {
1465 		ss = &sc->ss[slice];
1466 		if (ss == NULL || ss->sysctl_tree == NULL)
1467 			continue;
1468 		sysctl_ctx_free(&ss->sysctl_ctx);
1469 		ss->sysctl_tree = NULL;
1470 	}
1471 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1472 	sc->slice_sysctl_tree = NULL;
1473 	sysctl_ctx_free(&sc->sysctl_ctx);
1474 	sc->sysctl_tree = NULL;
1475 
1476 }
1477 
1478 static void
1479 mxge_add_sysctls(mxge_softc_t *sc)
1480 {
1481 	struct sysctl_ctx_list *ctx;
1482 	struct sysctl_oid_list *children;
1483 	mcp_irq_data_t *fw;
1484 	struct mxge_slice_state *ss;
1485 	int slice;
1486 	char slice_num[8];
1487 
1488 	ctx = &sc->sysctl_ctx;
1489 	sysctl_ctx_init(ctx);
1490 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1491 					  OID_AUTO,
1492 					  device_get_nameunit(sc->dev),
1493 					  CTLFLAG_RD, 0, "");
1494 	if (sc->sysctl_tree == NULL) {
1495 		device_printf(sc->dev, "can't add sysctl node\n");
1496 		return;
1497 	}
1498 
1499 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1500 	fw = sc->ss[0].fw_stats;
1501 
1502 	/* random information */
1503 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1504 		       "firmware_version",
1505 		       CTLFLAG_RD, &sc->fw_version,
1506 		       0, "firmware version");
1507 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508 		       "serial_number",
1509 		       CTLFLAG_RD, &sc->serial_number_string,
1510 		       0, "serial number");
1511 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512 		       "product_code",
1513 		       CTLFLAG_RD, &sc->product_code_string,
1514 		       0, "product_code");
1515 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1516 		       "pcie_link_width",
1517 		       CTLFLAG_RD, &sc->link_width,
1518 		       0, "tx_boundary");
1519 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 		       "tx_boundary",
1521 		       CTLFLAG_RD, &sc->tx_boundary,
1522 		       0, "tx_boundary");
1523 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524 		       "write_combine",
1525 		       CTLFLAG_RD, &sc->wc,
1526 		       0, "write combining PIO?");
1527 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528 		       "read_dma_MBs",
1529 		       CTLFLAG_RD, &sc->read_dma,
1530 		       0, "DMA Read speed in MB/s");
1531 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532 		       "write_dma_MBs",
1533 		       CTLFLAG_RD, &sc->write_dma,
1534 		       0, "DMA Write speed in MB/s");
1535 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536 		       "read_write_dma_MBs",
1537 		       CTLFLAG_RD, &sc->read_write_dma,
1538 		       0, "DMA concurrent Read/Write speed in MB/s");
1539 
1540 
1541 	/* performance related tunables */
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"intr_coal_delay",
1544 			CTLTYPE_INT|CTLFLAG_RW, sc,
1545 			0, mxge_change_intr_coal,
1546 			"I", "interrupt coalescing delay in usecs");
1547 
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 			"flow_control_enabled",
1550 			CTLTYPE_INT|CTLFLAG_RW, sc,
1551 			0, mxge_change_flow_control,
1552 			"I", "interrupt coalescing delay in usecs");
1553 
1554 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1555 		       "deassert_wait",
1556 		       CTLFLAG_RW, &mxge_deassert_wait,
1557 		       0, "Wait for IRQ line to go low in ihandler");
1558 
1559 	/* stats block from firmware is in network byte order.
1560 	   Need to swap it */
1561 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 			"link_up",
1563 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1564 			0, mxge_handle_be32,
1565 			"I", "link up");
1566 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567 			"rdma_tags_available",
1568 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1569 			0, mxge_handle_be32,
1570 			"I", "rdma_tags_available");
1571 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572 			"dropped_bad_crc32",
1573 			CTLTYPE_INT|CTLFLAG_RD,
1574 			&fw->dropped_bad_crc32,
1575 			0, mxge_handle_be32,
1576 			"I", "dropped_bad_crc32");
1577 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578 			"dropped_bad_phy",
1579 			CTLTYPE_INT|CTLFLAG_RD,
1580 			&fw->dropped_bad_phy,
1581 			0, mxge_handle_be32,
1582 			"I", "dropped_bad_phy");
1583 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584 			"dropped_link_error_or_filtered",
1585 			CTLTYPE_INT|CTLFLAG_RD,
1586 			&fw->dropped_link_error_or_filtered,
1587 			0, mxge_handle_be32,
1588 			"I", "dropped_link_error_or_filtered");
1589 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 			"dropped_link_overflow",
1591 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1592 			0, mxge_handle_be32,
1593 			"I", "dropped_link_overflow");
1594 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1595 			"dropped_multicast_filtered",
1596 			CTLTYPE_INT|CTLFLAG_RD,
1597 			&fw->dropped_multicast_filtered,
1598 			0, mxge_handle_be32,
1599 			"I", "dropped_multicast_filtered");
1600 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1601 			"dropped_no_big_buffer",
1602 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1603 			0, mxge_handle_be32,
1604 			"I", "dropped_no_big_buffer");
1605 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606 			"dropped_no_small_buffer",
1607 			CTLTYPE_INT|CTLFLAG_RD,
1608 			&fw->dropped_no_small_buffer,
1609 			0, mxge_handle_be32,
1610 			"I", "dropped_no_small_buffer");
1611 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1612 			"dropped_overrun",
1613 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1614 			0, mxge_handle_be32,
1615 			"I", "dropped_overrun");
1616 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1617 			"dropped_pause",
1618 			CTLTYPE_INT|CTLFLAG_RD,
1619 			&fw->dropped_pause,
1620 			0, mxge_handle_be32,
1621 			"I", "dropped_pause");
1622 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1623 			"dropped_runt",
1624 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1625 			0, mxge_handle_be32,
1626 			"I", "dropped_runt");
1627 
1628 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1629 			"dropped_unicast_filtered",
1630 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1631 			0, mxge_handle_be32,
1632 			"I", "dropped_unicast_filtered");
1633 
1634 	/* verbose printing? */
1635 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 		       "verbose",
1637 		       CTLFLAG_RW, &mxge_verbose,
1638 		       0, "verbose printing");
1639 
1640 	/* lro */
1641 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1642 			"lro_cnt",
1643 			CTLTYPE_INT|CTLFLAG_RW, sc,
1644 			0, mxge_change_lro,
1645 			"I", "number of lro merge queues");
1646 
1647 
1648 	/* add counters exported for debugging from all slices */
1649 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1650 	sc->slice_sysctl_tree =
1651 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1652 				"slice", CTLFLAG_RD, 0, "");
1653 
1654 	for (slice = 0; slice < sc->num_slices; slice++) {
1655 		ss = &sc->ss[slice];
1656 		sysctl_ctx_init(&ss->sysctl_ctx);
1657 		ctx = &ss->sysctl_ctx;
1658 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1659 		ksprintf(slice_num, "%d", slice);
1660 		ss->sysctl_tree =
1661 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1662 					CTLFLAG_RD, 0, "");
1663 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1664 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665 			       "rx_small_cnt",
1666 			       CTLFLAG_RD, &ss->rx_small.cnt,
1667 			       0, "rx_small_cnt");
1668 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669 			       "rx_big_cnt",
1670 			       CTLFLAG_RD, &ss->rx_big.cnt,
1671 			       0, "rx_small_cnt");
1672 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1674 			       0, "number of lro merge queues flushed");
1675 
1676 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1678 			       0, "number of frames appended to lro merge"
1679 			       "queues");
1680 
1681 #ifndef IFNET_BUF_RING
1682 		/* only transmit from slice 0 for now */
1683 		if (slice > 0)
1684 			continue;
1685 #endif
1686 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 			       "tx_req",
1688 			       CTLFLAG_RD, &ss->tx.req,
1689 			       0, "tx_req");
1690 
1691 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692 			       "tx_done",
1693 			       CTLFLAG_RD, &ss->tx.done,
1694 			       0, "tx_done");
1695 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1696 			       "tx_pkt_done",
1697 			       CTLFLAG_RD, &ss->tx.pkt_done,
1698 			       0, "tx_done");
1699 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1700 			       "tx_stall",
1701 			       CTLFLAG_RD, &ss->tx.stall,
1702 			       0, "tx_stall");
1703 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1704 			       "tx_wake",
1705 			       CTLFLAG_RD, &ss->tx.wake,
1706 			       0, "tx_wake");
1707 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1708 			       "tx_defrag",
1709 			       CTLFLAG_RD, &ss->tx.defrag,
1710 			       0, "tx_defrag");
1711 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1712 			       "tx_queue_active",
1713 			       CTLFLAG_RD, &ss->tx.queue_active,
1714 			       0, "tx_queue_active");
1715 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1716 			       "tx_activate",
1717 			       CTLFLAG_RD, &ss->tx.activate,
1718 			       0, "tx_activate");
1719 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1720 			       "tx_deactivate",
1721 			       CTLFLAG_RD, &ss->tx.deactivate,
1722 			       0, "tx_deactivate");
1723 	}
1724 }
1725 
1726 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1727    backwards one at a time and handle ring wraps */
1728 
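/*
 * Writing the descriptors in reverse order matters on a ring wrap: the
 * NIC may begin fetching as soon as it sees a valid first request, so
 * every later slot must be in place before the first one is made valid
 * (mxge_submit_req() below re-arms the valid flags of the first request
 * last for the same reason).
 */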
1729 static inline void
1730 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1731 			    mcp_kreq_ether_send_t *src, int cnt)
1732 {
1733         int idx, starting_slot;
1734         starting_slot = tx->req;
1735         while (cnt > 1) {
1736                 cnt--;
1737                 idx = (starting_slot + cnt) & tx->mask;
1738                 mxge_pio_copy(&tx->lanai[idx],
1739 			      &src[cnt], sizeof(*src));
1740                 wmb();
1741         }
1742 }
1743 
1744 /*
1745  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1746  * at most 32 bytes at a time, so as to avoid involving the software
1747  * pio handler in the nic.   We re-write the first segment's flags
1748  * to mark them valid only after writing the entire chain
1749  */
1750 
1751 static inline void
1752 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1753                   int cnt)
1754 {
1755         int idx, i;
1756         uint32_t *src_ints;
1757 	volatile uint32_t *dst_ints;
1758         mcp_kreq_ether_send_t *srcp;
1759 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1760 	uint8_t last_flags;
1761 
1762         idx = tx->req & tx->mask;
1763 
1764 	last_flags = src->flags;
1765 	src->flags = 0;
1766         wmb();
1767         dst = dstp = &tx->lanai[idx];
1768         srcp = src;
1769 
1770         if ((idx + cnt) < tx->mask) {
1771                 for (i = 0; i < (cnt - 1); i += 2) {
1772                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1773                         wmb(); /* force write every 32 bytes */
1774                         srcp += 2;
1775                         dstp += 2;
1776                 }
1777         } else {
1778                 /* submit all but the first request, and ensure
1779                    that it is submitted below */
1780                 mxge_submit_req_backwards(tx, src, cnt);
1781                 i = 0;
1782         }
1783         if (i < cnt) {
1784                 /* submit the first request */
1785                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1786                 wmb(); /* barrier before setting valid flag */
1787         }
1788 
1789         /* re-write the last 32-bits with the valid flags */
1790         src->flags = last_flags;
1791         src_ints = (uint32_t *)src;
1792         src_ints+=3;
1793         dst_ints = (volatile uint32_t *)dst;
1794         dst_ints+=3;
1795         *dst_ints =  *src_ints;
1796         tx->req += cnt;
1797         wmb();
1798 }
1799 
1800 #if IFCAP_TSO4
1801 
1802 static void
1803 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1804 	       int busdma_seg_cnt, int ip_off)
1805 {
1806 	mxge_tx_ring_t *tx;
1807 	mcp_kreq_ether_send_t *req;
1808 	bus_dma_segment_t *seg;
1809 	struct ip *ip;
1810 	struct tcphdr *tcp;
1811 	uint32_t low, high_swapped;
1812 	int len, seglen, cum_len, cum_len_next;
1813 	int next_is_first, chop, cnt, rdma_count, small;
1814 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1815 	uint8_t flags, flags_next;
1816 	static int once;
1817 
1818 	mss = m->m_pkthdr.tso_segsz;
1819 
1820 	/* negative cum_len signifies to the
1821 	 * send loop that we are still in the
1822 	 * header portion of the TSO packet.
1823 	 */
1824 
1825 	/* ensure we have the ethernet, IP and TCP
1826 	   header together in the first mbuf, copy
1827 	   it to a scratch buffer if not */
1828 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1829 		m_copydata(m, 0, ip_off + sizeof (*ip),
1830 			   ss->scratch);
1831 		ip = (struct ip *)(ss->scratch + ip_off);
1832 	} else {
1833 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1834 	}
1835 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1836 			    + sizeof (*tcp))) {
1837 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1838 			   + sizeof (*tcp),  ss->scratch);
1839 		ip = (struct ip *)(ss->scratch + ip_off);
1840 	}
1841 
1842 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1843 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1844 
1845 	/* TSO implies checksum offload on this hardware */
1846 	cksum_offset = ip_off + (ip->ip_hl << 2);
1847 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1848 
1849 
1850 	/* for TSO, pseudo_hdr_offset holds mss.
1851 	 * The firmware figures out where to put
1852 	 * the checksum by parsing the header. */
1853 	pseudo_hdr_offset = htobe16(mss);
1854 
1855 	tx = &ss->tx;
1856 	req = tx->req_list;
1857 	seg = tx->seg_list;
1858 	cnt = 0;
1859 	rdma_count = 0;
1860 	/* "rdma_count" is the number of RDMAs belonging to the
1861 	 * current packet BEFORE the current send request. For
1862 	 * non-TSO packets, this is equal to "count".
1863 	 * For TSO packets, rdma_count needs to be reset
1864 	 * to 0 after a segment cut.
1865 	 *
1866 	 * The rdma_count field of the send request is
1867 	 * the number of RDMAs of the packet starting at
1868 	 * that request. For TSO send requests with one or more cuts
1869 	 * in the middle, this is the number of RDMAs starting
1870 	 * after the last cut in the request. All previous
1871 	 * segments before the last cut implicitly have 1 RDMA.
1872 	 *
1873 	 * Since the number of RDMAs is not known beforehand,
1874 	 * it must be filled-in retroactively - after each
1875 	 * segmentation cut or at the end of the entire packet.
1876 	 */
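	/*
	 * Sketch (illustrative): if a segmentation cut lands in the
	 * middle of the request list, rdma_count restarts after the
	 * cut, and the "(req - rdma_count)->rdma_count = rdma_count + 1"
	 * store in the loop below retroactively patches the first send
	 * of the current run, as the comment above describes.
	 */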
1877 
1878 	while (busdma_seg_cnt) {
1879 		/* Break the busdma segment up into pieces*/
1880 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1881 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1882 		len = seg->ds_len;
1883 
1884 		while (len) {
1885 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1886 			seglen = len;
1887 			cum_len_next = cum_len + seglen;
1888 			(req-rdma_count)->rdma_count = rdma_count + 1;
1889 			if (__predict_true(cum_len >= 0)) {
1890 				/* payload */
1891 				chop = (cum_len_next > mss);
1892 				cum_len_next = cum_len_next % mss;
1893 				next_is_first = (cum_len_next == 0);
1894 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1895 				flags_next |= next_is_first *
1896 					MXGEFW_FLAGS_FIRST;
1897 				rdma_count |= -(chop | next_is_first);
1898 				rdma_count += chop & !next_is_first;
1899 			} else if (cum_len_next >= 0) {
1900 				/* header ends */
1901 				rdma_count = -1;
1902 				cum_len_next = 0;
1903 				seglen = -cum_len;
1904 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1905 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1906 					MXGEFW_FLAGS_FIRST |
1907 					(small * MXGEFW_FLAGS_SMALL);
1908 			}
1909 
1910 			req->addr_high = high_swapped;
1911 			req->addr_low = htobe32(low);
1912 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1913 			req->pad = 0;
1914 			req->rdma_count = 1;
1915 			req->length = htobe16(seglen);
1916 			req->cksum_offset = cksum_offset;
1917 			req->flags = flags | ((cum_len & 1) *
1918 					      MXGEFW_FLAGS_ALIGN_ODD);
1919 			low += seglen;
1920 			len -= seglen;
1921 			cum_len = cum_len_next;
1922 			flags = flags_next;
1923 			req++;
1924 			cnt++;
1925 			rdma_count++;
1926 			if (__predict_false(cksum_offset > seglen))
1927 				cksum_offset -= seglen;
1928 			else
1929 				cksum_offset = 0;
1930 			if (__predict_false(cnt > tx->max_desc))
1931 				goto drop;
1932 		}
1933 		busdma_seg_cnt--;
1934 		seg++;
1935 	}
1936 	(req-rdma_count)->rdma_count = rdma_count;
1937 
1938 	do {
1939 		req--;
1940 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1941 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1942 
1943 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1944 	mxge_submit_req(tx, tx->req_list, cnt);
1945 #ifdef IFNET_BUF_RING
1946 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1947 		/* tell the NIC to start polling this slice */
1948 		*tx->send_go = 1;
1949 		tx->queue_active = 1;
1950 		tx->activate++;
1951 		wmb();
1952 	}
1953 #endif
1954 	return;
1955 
1956 drop:
1957 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1958 	m_freem(m);
1959 	ss->oerrors++;
1960 	if (!once) {
1961 		kprintf("tx->max_desc exceeded via TSO!\n");
1962 		kprintf("mss = %d, seg offset = %ld, max_desc = %d!\n", mss,
1963 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1964 		once = 1;
1965 	}
1966 	return;
1967 
1968 }
1969 
1970 #endif /* IFCAP_TSO4 */
1971 
1972 #ifdef MXGE_NEW_VLAN_API
1973 /*
1974  * We reproduce the software vlan tag insertion from
1975  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1976  * vlan tag insertion. We need to advertise this in order to have the
1977  * vlan interface respect our csum offload flags.
1978  */
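/*
 * Layout sketch (illustrative): M_PREPEND below makes room for the
 * 4-byte 802.1Q tag, then the 12 address bytes slide down so the frame
 * goes from [dst|src|type|data] to [dst|src|0x8100|tag|type|data].
 */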
1979 static struct mbuf *
1980 mxge_vlan_tag_insert(struct mbuf *m)
1981 {
1982 	struct ether_vlan_header *evl;
1983 
1984 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1985 	if (__predict_false(m == NULL))
1986 		return NULL;
1987 	if (m->m_len < sizeof(*evl)) {
1988 		m = m_pullup(m, sizeof(*evl));
1989 		if (__predict_false(m == NULL))
1990 			return NULL;
1991 	}
1992 	/*
1993 	 * Transform the Ethernet header into an Ethernet header
1994 	 * with 802.1Q encapsulation.
1995 	 */
1996 	evl = mtod(m, struct ether_vlan_header *);
1997 	bcopy((char *)evl + EVL_ENCAPLEN,
1998 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1999 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2000 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2001 	m->m_flags &= ~M_VLANTAG;
2002 	return m;
2003 }
2004 #endif /* MXGE_NEW_VLAN_API */
2005 
2006 static void
2007 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2008 {
2009 	mxge_softc_t *sc;
2010 	mcp_kreq_ether_send_t *req;
2011 	bus_dma_segment_t *seg;
2012 	struct mbuf *m_tmp;
2013 	struct ifnet *ifp;
2014 	mxge_tx_ring_t *tx;
2015 	struct ip *ip;
2016 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2017 	uint16_t pseudo_hdr_offset;
2018 	uint8_t flags, cksum_offset;
2019 
2020 
2021 	sc = ss->sc;
2022 	ifp = sc->ifp;
2023 	tx = &ss->tx;
2024 
2025 	ip_off = sizeof (struct ether_header);
2026 #ifdef MXGE_NEW_VLAN_API
2027 	if (m->m_flags & M_VLANTAG) {
2028 		m = mxge_vlan_tag_insert(m);
2029 		if (__predict_false(m == NULL))
2030 			goto drop;
2031 		ip_off += EVL_ENCAPLEN;
2032 	}
2033 #endif
2034 	/* (try to) map the frame for DMA */
2035 	idx = tx->req & tx->mask;
2036 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2037 					   m, tx->seg_list, 1, &cnt,
2038 					   BUS_DMA_NOWAIT);
2039 	if (__predict_false(err == EFBIG)) {
2040 		/* Too many segments in the chain.  Try
2041 		   to defrag */
2042 		m_tmp = m_defrag(m, M_NOWAIT);
2043 		if (m_tmp == NULL) {
2044 			goto drop;
2045 		}
2046 		ss->tx.defrag++;
2047 		m = m_tmp;
2048 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2049 					      tx->info[idx].map,
2050 					      m, tx->seg_list, 1, &cnt,
2051 					      BUS_DMA_NOWAIT);
2052 	}
2053 	if (__predict_false(err != 0)) {
2054 		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d"
2055 			      " packet len = %d\n", err, m->m_pkthdr.len);
2056 		goto drop;
2057 	}
2058 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2059 			BUS_DMASYNC_PREWRITE);
2060 	tx->info[idx].m = m;
2061 
2062 #if IFCAP_TSO4
2063 	/* TSO is different enough, we handle it in another routine */
2064 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2065 		mxge_encap_tso(ss, m, cnt, ip_off);
2066 		return;
2067 	}
2068 #endif
2069 
2070 	req = tx->req_list;
2071 	cksum_offset = 0;
2072 	pseudo_hdr_offset = 0;
2073 	flags = MXGEFW_FLAGS_NO_TSO;
2074 
2075 	/* checksum offloading? */
2076 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2077 		/* ensure ip header is in first mbuf, copy
2078 		   it to a scratch buffer if not */
2079 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2080 			m_copydata(m, 0, ip_off + sizeof (*ip),
2081 				   ss->scratch);
2082 			ip = (struct ip *)(ss->scratch + ip_off);
2083 		} else {
2084 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2085 		}
2086 		cksum_offset = ip_off + (ip->ip_hl << 2);
2087 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2088 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2089 		req->cksum_offset = cksum_offset;
2090 		flags |= MXGEFW_FLAGS_CKSUM;
2091 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2092 	} else {
2093 		odd_flag = 0;
2094 	}
2095 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2096 		flags |= MXGEFW_FLAGS_SMALL;
2097 
2098 	/* convert segments into a request list */
2099 	cum_len = 0;
2100 	seg = tx->seg_list;
2101 	req->flags = MXGEFW_FLAGS_FIRST;
2102 	for (i = 0; i < cnt; i++) {
2103 		req->addr_low =
2104 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2105 		req->addr_high =
2106 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2107 		req->length = htobe16(seg->ds_len);
2108 		req->cksum_offset = cksum_offset;
2109 		if (cksum_offset > seg->ds_len)
2110 			cksum_offset -= seg->ds_len;
2111 		else
2112 			cksum_offset = 0;
2113 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2114 		req->pad = 0; /* complete solid 16-byte block */
2115 		req->rdma_count = 1;
2116 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2117 		cum_len += seg->ds_len;
2118 		seg++;
2119 		req++;
2120 		req->flags = 0;
2121 	}
2122 	req--;
2123 	/* pad runts to 60 bytes */
2124 	if (cum_len < 60) {
2125 		req++;
2126 		req->addr_low =
2127 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2128 		req->addr_high =
2129 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2130 		req->length = htobe16(60 - cum_len);
2131 		req->cksum_offset = 0;
2132 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2133 		req->pad = 0; /* complete solid 16-byte block */
2134 		req->rdma_count = 1;
2135 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2136 		cnt++;
2137 	}
2138 
2139 	tx->req_list[0].rdma_count = cnt;
2140 #if 0
2141 	/* print what the firmware will see */
2142 	for (i = 0; i < cnt; i++) {
2143 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2144 		    "cso:%d, flags:0x%x, rdma:%d\n",
2145 		    i, (int)ntohl(tx->req_list[i].addr_high),
2146 		    (int)ntohl(tx->req_list[i].addr_low),
2147 		    (int)ntohs(tx->req_list[i].length),
2148 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2149 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2150 		    tx->req_list[i].rdma_count);
2151 	}
2152 	kprintf("--------------\n");
2153 #endif
2154 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2155 	mxge_submit_req(tx, tx->req_list, cnt);
2156 #ifdef IFNET_BUF_RING
2157 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2158 		/* tell the NIC to start polling this slice */
2159 		*tx->send_go = 1;
2160 		tx->queue_active = 1;
2161 		tx->activate++;
2162 		wmb();
2163 	}
2164 #endif
2165 	return;
2166 
2167 drop:
2168 	m_freem(m);
2169 	ss->oerrors++;
2170 	return;
2171 }
2172 
2173 #ifdef IFNET_BUF_RING
2174 static void
2175 mxge_qflush(struct ifnet *ifp)
2176 {
2177 	mxge_softc_t *sc = ifp->if_softc;
2178 	mxge_tx_ring_t *tx;
2179 	struct mbuf *m;
2180 	int slice;
2181 
2182 	for (slice = 0; slice < sc->num_slices; slice++) {
2183 		tx = &sc->ss[slice].tx;
2184 		lwkt_serialize_enter(sc->ifp->if_serializer);
2185 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2186 			m_freem(m);
2187 		lwkt_serialize_exit(sc->ifp->if_serializer);
2188 	}
2189 	if_qflush(ifp);
2190 }
2191 
2192 static inline void
2193 mxge_start_locked(struct mxge_slice_state *ss)
2194 {
2195 	mxge_softc_t *sc;
2196 	struct mbuf *m;
2197 	struct ifnet *ifp;
2198 	mxge_tx_ring_t *tx;
2199 
2200 	sc = ss->sc;
2201 	ifp = sc->ifp;
2202 	tx = &ss->tx;
2203 
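	/*
	 * Ring-space check, sketched: for a power-of-two ring,
	 * (tx->req - tx->done) counts in-flight descriptors, so
	 * (mask - in_flight) > max_desc guarantees room for one
	 * worst-case frame before it is dequeued.  The same test
	 * guards the other transmit paths below.
	 */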
2204 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2205 		m = drbr_dequeue(ifp, tx->br);
2206 		if (m == NULL) {
2207 			return;
2208 		}
2209 		/* let BPF see it */
2210 		BPF_MTAP(ifp, m);
2211 
2212 		/* give it to the nic */
2213 		mxge_encap(ss, m);
2214 	}
2215 	/* ran out of transmit slots */
2216 	if (((ss->if_flags & IFF_OACTIVE) == 0)
2217 	    && (!drbr_empty(ifp, tx->br))) {
2218 		ss->if_flags |= IFF_OACTIVE;
2219 		tx->stall++;
2220 	}
2221 }
2222 
2223 static int
2224 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2225 {
2226 	mxge_softc_t *sc;
2227 	struct ifnet *ifp;
2228 	mxge_tx_ring_t *tx;
2229 	int err;
2230 
2231 	sc = ss->sc;
2232 	ifp = sc->ifp;
2233 	tx = &ss->tx;
2234 
2235 	if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2236 	    IFF_RUNNING) {
2237 		err = drbr_enqueue(ifp, tx->br, m);
2238 		return (err);
2239 	}
2240 
2241 	if (drbr_empty(ifp, tx->br) &&
2242 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2243 		/* let BPF see it */
2244 		BPF_MTAP(ifp, m);
2245 		/* give it to the nic */
2246 		mxge_encap(ss, m);
2247 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2248 		return (err);
2249 	}
2250 	if (!drbr_empty(ifp, tx->br))
2251 		mxge_start_locked(ss);
2252 	return (0);
2253 }
2254 
2255 static int
2256 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2257 {
2258 	mxge_softc_t *sc = ifp->if_softc;
2259 	struct mxge_slice_state *ss;
2260 	mxge_tx_ring_t *tx;
2261 	int err = 0;
2262 	int slice = 0;	/* flowid steering is ifdef'ed out below */
2263 
2264 #if 0
2265 	slice = m->m_pkthdr.flowid;
2266 #endif
2267 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2268 
2269 	ss = &sc->ss[slice];
2270 	tx = &ss->tx;
2271 
2272 	if(lwkt_serialize_try(ifp->if_serializer)) {
2273 		err = mxge_transmit_locked(ss, m);
2274 		lwkt_serialize_exit(ifp->if_serializer);
2275 	} else {
2276 		err = drbr_enqueue(ifp, tx->br, m);
2277 	}
2278 
2279 	return (err);
2280 }
2281 
2282 #else
2283 
2284 static inline void
2285 mxge_start_locked(struct mxge_slice_state *ss)
2286 {
2287 	mxge_softc_t *sc;
2288 	struct mbuf *m;
2289 	struct ifnet *ifp;
2290 	mxge_tx_ring_t *tx;
2291 
2292 	sc = ss->sc;
2293 	ifp = sc->ifp;
2294 	tx = &ss->tx;
2295 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2296 		m = ifq_dequeue(&ifp->if_snd, NULL);
2297 		if (m == NULL) {
2298 			return;
2299 		}
2300 		/* let BPF see it */
2301 		BPF_MTAP(ifp, m);
2302 
2303 		/* give it to the nic */
2304 		mxge_encap(ss, m);
2305 	}
2306 	/* ran out of transmit slots */
2307 	if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2308 		sc->ifp->if_flags |= IFF_OACTIVE;
2309 		tx->stall++;
2310 	}
2311 }
2312 #endif
2313 static void
2314 mxge_start(struct ifnet *ifp)
2315 {
2316 	mxge_softc_t *sc = ifp->if_softc;
2317 	struct mxge_slice_state *ss;
2318 
2319 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2320 	/* only use the first slice for now */
2321 	ss = &sc->ss[0];
2322 	mxge_start_locked(ss);
2323 }
2324 
2325 /*
2326  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2327  * at most 32 bytes at a time, so as to avoid involving the software
2328  * pio handler in the nic.   We re-write the first segment's low
2329  * DMA address to mark it valid only after we write the entire chunk
2330  * in a burst
2331  */
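/*
 * Handoff sketch (illustrative): addr_low of the first descriptor is
 * poisoned to 0xffffffff while the 8-descriptor burst is PIO-copied,
 * and the real address is written last, so the NIC never sees a
 * partially written group.
 */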
2332 static inline void
2333 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2334 		mcp_kreq_ether_recv_t *src)
2335 {
2336 	uint32_t low;
2337 
2338 	low = src->addr_low;
2339 	src->addr_low = 0xffffffff;
2340 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2341 	wmb();
2342 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2343 	wmb();
2344 	src->addr_low = low;
2345 	dst->addr_low = low;
2346 	wmb();
2347 }
2348 
2349 static int
2350 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2351 {
2352 	bus_dma_segment_t seg;
2353 	struct mbuf *m;
2354 	mxge_rx_ring_t *rx = &ss->rx_small;
2355 	int cnt, err;
2356 
2357 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2358 	if (m == NULL) {
2359 		rx->alloc_fail++;
2360 		err = ENOBUFS;
2361 		goto done;
2362 	}
2363 	m->m_len = m->m_pkthdr.len = MHLEN;
2364 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2365 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2366 	if (err != 0) {
2367 		kprintf("can't dmamap small (%d)\n", err);
2368 		m_free(m);
2369 		goto done;
2370 	}
2371 	rx->info[idx].m = m;
2372 	rx->shadow[idx].addr_low =
2373 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2374 	rx->shadow[idx].addr_high =
2375 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2376 
2377 done:
2378 	if ((idx & 7) == 7)
2379 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2380 	return err;
2381 }
2382 
2383 
2384 static int
2385 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2386 {
2387 	bus_dma_segment_t seg[3];
2388 	struct mbuf *m;
2389 	mxge_rx_ring_t *rx = &ss->rx_big;
2390 	int cnt, err, i;
2391 
2392 	if (rx->cl_size == MCLBYTES)
2393 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2394 	else {
2395 #if 0
2396 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2397 #else
2398 		/*
2399 		 * XXX: allocate normal sized buffers for big buffers.
2400 		 * We should be fine as long as we don't get any jumbo frames
2401 		 */
2402 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2403 #endif
2404 	}
2405 	if (m == NULL) {
2406 		rx->alloc_fail++;
2407 		err = ENOBUFS;
2408 		goto done;
2409 	}
2410 	/* use the full configured big-buffer length for this mbuf */
2411 	m->m_len = m->m_pkthdr.len = rx->mlen;
2412 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2413 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2414 	if (err != 0) {
2415 		kprintf("can't dmamap big (%d)\n", err);
2416 		m_free(m);
2417 		goto done;
2418 	}
2419 	rx->info[idx].m = m;
2420 	rx->shadow[idx].addr_low =
2421 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2422 	rx->shadow[idx].addr_high =
2423 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2424 
2425 #if MXGE_VIRT_JUMBOS
2426 	for (i = 1; i < cnt; i++) {
2427 		rx->shadow[idx + i].addr_low =
2428 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2429 		rx->shadow[idx + i].addr_high =
2430 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2431 	}
2432 #endif
2433 
2434 done:
2435 	for (i = 0; i < rx->nbufs; i++) {
2436 		if ((idx & 7) == 7) {
2437 			mxge_submit_8rx(&rx->lanai[idx - 7],
2438 					&rx->shadow[idx - 7]);
2439 		}
2440 		idx++;
2441 	}
2442 	return err;
2443 }
2444 
2445 /*
2446  *  Myri10GE hardware checksums are not valid if the sender
2447  *  padded the frame with non-zero padding.  This is because
2448  *  the firmware just does a simple 16-bit 1s complement
2449  *  checksum across the entire frame, excluding the first 14
2450  *  bytes.  It is best to simply check the checksum and
2451  *  tell the stack about it only if the checksum is good
2452  */
2453 
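/*
 * Verification sketch (illustrative): the firmware sums everything
 * past the 14-byte Ethernet header, so for a good TCP/UDP packet the
 * pseudo-header sum folded into that hardware sum is 0xffff; the
 * routine below builds exactly that fold and returns 0 on success
 * (after the final xor with 0xffff).
 */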
2454 static inline uint16_t
2455 mxge_rx_csum(struct mbuf *m, int csum)
2456 {
2457 	struct ether_header *eh;
2458 	struct ip *ip;
2459 	uint16_t c;
2460 
2461 	eh = mtod(m, struct ether_header *);
2462 
2463 	/* only deal with IPv4 TCP & UDP for now */
2464 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2465 		return 1;
2466 	ip = (struct ip *)(eh + 1);
2467 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2468 			    ip->ip_p != IPPROTO_UDP))
2469 		return 1;
2470 #ifdef INET
2471 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2472 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2473 			    (ip->ip_hl << 2) + ip->ip_p));
2474 #else
2475 	c = 1;
2476 #endif
2477 	c ^= 0xffff;
2478 	return (c);
2479 }
2480 
2481 static void
2482 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2483 {
2484 	struct ether_vlan_header *evl;
2485 	struct ether_header *eh;
2486 	uint32_t partial;
2487 
2488 	evl = mtod(m, struct ether_vlan_header *);
2489 	eh = mtod(m, struct ether_header *);
2490 
2491 	/*
2492 	 * fix checksum by subtracting EVL_ENCAPLEN bytes
2493 	 * after what the firmware thought was the end of the ethernet
2494 	 * header.
2495 	 */
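	/*
	 * Ones-complement arithmetic sketch: adding ~partial with
	 * end-around carry subtracts the 4 tag bytes from the running
	 * sum; the two fold steps below then collapse the 32-bit
	 * accumulator back to 16 bits.
	 */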
2496 
2497 	/* put checksum into host byte order */
2498 	*csum = ntohs(*csum);
2499 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2500 	(*csum) += ~partial;
2501 	(*csum) +=  ((*csum) < ~partial);
2502 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2503 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2504 
2505 	/* restore checksum to network byte order;
2506 	   later consumers expect this */
2507 	*csum = htons(*csum);
2508 
2509 	/* save the tag */
2510 #ifdef MXGE_NEW_VLAN_API
2511 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2512 #else
2513 	{
2514 		struct m_tag *mtag;
2515 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2516 				   MB_DONTWAIT);
2517 		if (mtag == NULL)
2518 			return;
2519 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2520 		m_tag_prepend(m, mtag);
2521 	}
2522 
2523 #endif
2524 	m->m_flags |= M_VLANTAG;
2525 
2526 	/*
2527 	 * Remove the 802.1q header by copying the Ethernet
2528 	 * addresses over it and adjusting the beginning of
2529 	 * the data in the mbuf.  The encapsulated Ethernet
2530 	 * type field is already in place.
2531 	 */
2532 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2533 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2534 	m_adj(m, EVL_ENCAPLEN);
2535 }
2536 
2537 
2538 static inline void
2539 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2540 		   struct mbuf_chain *chain)
2541 {
2542 	mxge_softc_t *sc;
2543 	struct ifnet *ifp;
2544 	struct mbuf *m;
2545 	struct ether_header *eh;
2546 	mxge_rx_ring_t *rx;
2547 	bus_dmamap_t old_map;
2548 	int idx;
2549 	uint16_t tcpudp_csum;
2550 
2551 	sc = ss->sc;
2552 	ifp = sc->ifp;
2553 	rx = &ss->rx_big;
2554 	idx = rx->cnt & rx->mask;
2555 	rx->cnt += rx->nbufs;
2556 	/* save a pointer to the received mbuf */
2557 	m = rx->info[idx].m;
2558 	/* try to replace the received mbuf */
2559 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2560 		/* drop the frame -- the old mbuf is re-cycled */
2561 		ifp->if_ierrors++;
2562 		return;
2563 	}
2564 
2565 	/* unmap the received buffer */
2566 	old_map = rx->info[idx].map;
2567 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2568 	bus_dmamap_unload(rx->dmat, old_map);
2569 
2570 	/* swap the bus_dmamap_t's */
2571 	rx->info[idx].map = rx->extra_map;
2572 	rx->extra_map = old_map;
2573 
2574 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2575 	 * aligned */
2576 	m->m_data += MXGEFW_PAD;
2577 
2578 	m->m_pkthdr.rcvif = ifp;
2579 	m->m_len = m->m_pkthdr.len = len;
2580 	ss->ipackets++;
2581 	eh = mtod(m, struct ether_header *);
2582 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2583 		mxge_vlan_tag_remove(m, &csum);
2584 	}
2585 	/* if the checksum is valid, mark it in the mbuf header */
2586 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2587 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2588 			return;
2589 		/* otherwise, it was a UDP frame, or a TCP frame which
2590 		   we could not do LRO on.  Tell the stack that the
2591 		   checksum is good */
2592 		m->m_pkthdr.csum_data = 0xffff;
2593 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2594 	}
2595 #if 0
2596 	/* flowid only valid if RSS hashing is enabled */
2597 	if (sc->num_slices > 1) {
2598 		m->m_pkthdr.flowid = (ss - sc->ss);
2599 		m->m_flags |= M_FLOWID;
2600 	}
2601 #endif
2602 	ether_input_chain(ifp, m, NULL, chain);
2603 }
2604 
2605 static inline void
2606 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2607 		   struct mbuf_chain *chain)
2608 {
2609 	mxge_softc_t *sc;
2610 	struct ifnet *ifp;
2611 	struct ether_header *eh;
2612 	struct mbuf *m;
2613 	mxge_rx_ring_t *rx;
2614 	bus_dmamap_t old_map;
2615 	int idx;
2616 	uint16_t tcpudp_csum;
2617 
2618 	sc = ss->sc;
2619 	ifp = sc->ifp;
2620 	rx = &ss->rx_small;
2621 	idx = rx->cnt & rx->mask;
2622 	rx->cnt++;
2623 	/* save a pointer to the received mbuf */
2624 	m = rx->info[idx].m;
2625 	/* try to replace the received mbuf */
2626 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2627 		/* drop the frame -- the old mbuf is re-cycled */
2628 		ifp->if_ierrors++;
2629 		return;
2630 	}
2631 
2632 	/* unmap the received buffer */
2633 	old_map = rx->info[idx].map;
2634 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2635 	bus_dmamap_unload(rx->dmat, old_map);
2636 
2637 	/* swap the bus_dmamap_t's */
2638 	rx->info[idx].map = rx->extra_map;
2639 	rx->extra_map = old_map;
2640 
2641 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2642 	 * aligned */
2643 	m->m_data += MXGEFW_PAD;
2644 
2645 	m->m_pkthdr.rcvif = ifp;
2646 	m->m_len = m->m_pkthdr.len = len;
2647 	ss->ipackets++;
2648 	eh = mtod(m, struct ether_header *);
2649 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2650 		mxge_vlan_tag_remove(m, &csum);
2651 	}
2652 	/* if the checksum is valid, mark it in the mbuf header */
2653 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2654 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2655 			return;
2656 		/* otherwise, it was a UDP frame, or a TCP frame which
2657 		   we could not do LRO on.  Tell the stack that the
2658 		   checksum is good */
2659 		m->m_pkthdr.csum_data = 0xffff;
2660 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2661 	}
2662 #if 0
2663 	/* flowid only valid if RSS hashing is enabled */
2664 	if (sc->num_slices > 1) {
2665 		m->m_pkthdr.flowid = (ss - sc->ss);
2666 		m->m_flags |= M_FLOWID;
2667 	}
2668 #endif
2669 	ether_input_chain(ifp, m, NULL, chain);
2670 }
2671 
2672 static inline void
2673 mxge_clean_rx_done(struct mxge_slice_state *ss)
2674 {
2675 	mxge_rx_done_t *rx_done = &ss->rx_done;
2676 	int limit = 0;
2677 	uint16_t length;
2678 	uint16_t checksum;
2679 	struct mbuf_chain chain[MAXCPU];
2680 
2681 	ether_input_chain_init(chain);
2682 	while (rx_done->entry[rx_done->idx].length != 0) {
2683 		length = ntohs(rx_done->entry[rx_done->idx].length);
2684 		rx_done->entry[rx_done->idx].length = 0;
2685 		checksum = rx_done->entry[rx_done->idx].checksum;
2686 		if (length <= (MHLEN - MXGEFW_PAD))
2687 			mxge_rx_done_small(ss, length, checksum, chain);
2688 		else
2689 			mxge_rx_done_big(ss, length, checksum, chain);
2690 		rx_done->cnt++;
2691 		rx_done->idx = rx_done->cnt & rx_done->mask;
2692 
2693 		/* limit potential for livelock */
2694 		if (__predict_false(++limit > rx_done->mask / 2))
2695 			break;
2696 	}
2697 	ether_input_dispatch(chain);
2698 #ifdef INET
2699 	while (!SLIST_EMPTY(&ss->lro_active)) {
2700 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2701 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2702 		mxge_lro_flush(ss, lro);
2703 	}
2704 #endif
2705 }
2706 
2707 
2708 static inline void
2709 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2710 {
2711 	struct ifnet *ifp;
2712 	mxge_tx_ring_t *tx;
2713 	struct mbuf *m;
2714 	bus_dmamap_t map;
2715 	int idx;
2716 	int *flags;
2717 
2718 	tx = &ss->tx;
2719 	ifp = ss->sc->ifp;
2720 	ASSERT_SERIALIZED(ifp->if_serializer);
2721 	while (tx->pkt_done != mcp_idx) {
2722 		idx = tx->done & tx->mask;
2723 		tx->done++;
2724 		m = tx->info[idx].m;
2725 		/* mbuf and DMA map only attached to the first
2726 		   segment per-mbuf */
2727 		if (m != NULL) {
2728 			ss->obytes += m->m_pkthdr.len;
2729 			if (m->m_flags & M_MCAST)
2730 				ss->omcasts++;
2731 			ss->opackets++;
2732 			tx->info[idx].m = NULL;
2733 			map = tx->info[idx].map;
2734 			bus_dmamap_unload(tx->dmat, map);
2735 			m_freem(m);
2736 		}
2737 		if (tx->info[idx].flag) {
2738 			tx->info[idx].flag = 0;
2739 			tx->pkt_done++;
2740 		}
2741 	}
2742 
2743 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2744 	   it's OK to send packets */
2745 #ifdef IFNET_BUF_RING
2746 	flags = &ss->if_flags;
2747 #else
2748 	flags = &ifp->if_flags;
2749 #endif
2750 	if ((*flags) & IFF_OACTIVE &&
2751 	    tx->req - tx->done < (tx->mask + 1)/4) {
2752 		*(flags) &= ~IFF_OACTIVE;
2753 		ss->tx.wake++;
2754 		mxge_start_locked(ss);
2755 	}
2756 #ifdef IFNET_BUF_RING
2757 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2758 		/*
2759 		 * Let the NIC stop polling this queue, since there
2760 		 * are no more transmits pending.
2761 		 */
2762 		*tx->send_stop = 1;
2763 		tx->queue_active = 0;
2764 		tx->deactivate++;
2765 		wmb();
2766 	}
2767 #endif
2768 
2769 }
2770 
2771 static struct mxge_media_type mxge_xfp_media_types[] =
2772 {
2773 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2774 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2775 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2776 	{0,		(1 << 5),	"10GBASE-ER"},
2777 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2778 	{0,		(1 << 3),	"10GBASE-SW"},
2779 	{0,		(1 << 2),	"10GBASE-LW"},
2780 	{0,		(1 << 1),	"10GBASE-EW"},
2781 	{0,		(1 << 0),	"Reserved"}
2782 };
2783 static struct mxge_media_type mxge_sfp_media_types[] =
2784 {
2785 	{0,		(1 << 7),	"Reserved"},
2786 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2787 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2788 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2789 };
2790 
2791 static void
2792 mxge_set_media(mxge_softc_t *sc, int type)
2793 {
2794 	sc->media_flags |= type;
2795 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2796 	ifmedia_set(&sc->media, sc->media_flags);
2797 }
2798 
2799 
2800 /*
2801  * Determine the media type for a NIC.  Some XFPs will identify
2802  * themselves only when their link is up, so this is initiated via a
2803  * link up interrupt.  However, this can potentially take up to
2804  * several milliseconds, so it is run via the watchdog routine, rather
2805  * than in the interrupt handler itself.   This need only be done
2806  * once, not each time the link is up.
2807  */
2808 static void
2809 mxge_media_probe(mxge_softc_t *sc)
2810 {
2811 	mxge_cmd_t cmd;
2812 	char *cage_type;
2813 	char *ptr;
2814 	struct mxge_media_type *mxge_media_types = NULL;
2815 	int i, err, ms, mxge_media_type_entries;
2816 	uint32_t byte;
2817 
2818 	sc->need_media_probe = 0;
2819 
2820 	/* if we've already set a media type, we're done */
2821 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2822 		return;
2823 
2824 	/*
2825 	 * parse the product code to determine the interface type
2826 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2827 	 * after the 3rd dash in the driver's cached copy of the
2828 	 * EEPROM's product code string.
2829 	 */
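	/*
	 * Example (illustrative): for a product code such as
	 * "10G-PCIE-8A-R", the character after the third dash is 'R',
	 * which the checks below classify as an XFP cage.
	 */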
2830 	ptr = sc->product_code_string;
2831 	if (ptr == NULL) {
2832 		device_printf(sc->dev, "Missing product code\n");
2833 		return;	/* avoid dereferencing a NULL product code below */
	}
2834 
2835 	for (i = 0; i < 3; i++, ptr++) {
2836 		ptr = index(ptr, '-');
2837 		if (ptr == NULL) {
2838 			device_printf(sc->dev,
2839 				      "only %d dashes in PC?!?\n", i);
2840 			return;
2841 		}
2842 	}
2843 	if (*ptr == 'C') {
2844 		/* -C is CX4 */
2845 		mxge_set_media(sc, IFM_10G_CX4);
2846 		return;
2847 	}
2848 	else if (*ptr == 'Q') {
2849 		/* -Q is Quad Ribbon Fiber */
2850 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2851 		/* FreeBSD has no media type for Quad ribbon fiber */
2852 		return;
2853 	}
2854 
2855 	if (*ptr == 'R') {
2856 		/* -R is XFP */
2857 		mxge_media_types = mxge_xfp_media_types;
2858 		mxge_media_type_entries =
2859 			sizeof (mxge_xfp_media_types) /
2860 			sizeof (mxge_xfp_media_types[0]);
2861 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2862 		cage_type = "XFP";
2863 	}
2864 
2865 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2866 		/* -S or -2S is SFP+ */
2867 		mxge_media_types = mxge_sfp_media_types;
2868 		mxge_media_type_entries =
2869 			sizeof (mxge_sfp_media_types) /
2870 			sizeof (mxge_sfp_media_types[0]);
2871 		cage_type = "SFP+";
2872 		byte = 3;
2873 	}
2874 
2875 	if (mxge_media_types == NULL) {
2876 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2877 		return;
2878 	}
2879 
2880 	/*
2881 	 * At this point we know the NIC has a module cage, so now we
2882 	 * try to determine what is in the cage by using the
2883 	 * firmware's I2C commands to read the module's 10GbE compliance
2884 	 * register.  We read just one byte, which may take over
2885 	 * a millisecond.
2886 	 */
2887 
2888 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2889 	cmd.data1 = byte;
2890 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2891 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2892 		device_printf(sc->dev, "failed to read XFP\n");
2893 	}
2894 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2895 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2896 	}
2897 	if (err != MXGEFW_CMD_OK) {
2898 		return;
2899 	}
2900 
2901 	/* now we wait for the data to be cached */
2902 	cmd.data0 = byte;
2903 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2904 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2905 		DELAY(1000);
2906 		cmd.data0 = byte;
2907 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2908 	}
2909 	if (err != MXGEFW_CMD_OK) {
2910 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2911 			      cage_type, err, ms);
2912 		return;
2913 	}
2914 
2915 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2916 		if (mxge_verbose)
2917 			device_printf(sc->dev, "%s:%s\n", cage_type,
2918 				      mxge_media_types[0].name);
2919 		mxge_set_media(sc, IFM_10G_CX4);
2920 		return;
2921 	}
2922 	for (i = 1; i < mxge_media_type_entries; i++) {
2923 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2924 			if (mxge_verbose)
2925 				device_printf(sc->dev, "%s:%s\n",
2926 					      cage_type,
2927 					      mxge_media_types[i].name);
2928 
2929 			mxge_set_media(sc, mxge_media_types[i].flag);
2930 			return;
2931 		}
2932 	}
2933 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2934 		      cmd.data0);
2935 
2936 	return;
2937 }
2938 
2939 static void
2940 mxge_intr(void *arg)
2941 {
2942 	struct mxge_slice_state *ss = arg;
2943 	mxge_softc_t *sc = ss->sc;
2944 	mcp_irq_data_t *stats = ss->fw_stats;
2945 	mxge_tx_ring_t *tx = &ss->tx;
2946 	mxge_rx_done_t *rx_done = &ss->rx_done;
2947 	uint32_t send_done_count;
2948 	uint8_t valid;
2949 
2950 
2951 #ifndef IFNET_BUF_RING
2952 	/* an interrupt on a non-zero slice is implicitly valid
2953 	   since MSI-X irqs are not shared */
2954 	if (ss != sc->ss) {
2955 		mxge_clean_rx_done(ss);
2956 		*ss->irq_claim = be32toh(3);
2957 		return;
2958 	}
2959 #endif
2960 
2961 	/* make sure the DMA has finished */
2962 	if (!stats->valid) {
2963 		return;
2964 	}
2965 	valid = stats->valid;
2966 
2967 	if (sc->legacy_irq) {
2968 		/* lower legacy IRQ  */
2969 		*sc->irq_deassert = 0;
2970 		if (!mxge_deassert_wait)
2971 			/* don't wait for conf. that irq is low */
2972 			stats->valid = 0;
2973 	} else {
2974 		stats->valid = 0;
2975 	}
2976 
2977 	/* loop while waiting for legacy irq deassertion */
2978 	do {
2979 		/* check for transmit completes and receives */
2980 		send_done_count = be32toh(stats->send_done_count);
2981 		while ((send_done_count != tx->pkt_done) ||
2982 		       (rx_done->entry[rx_done->idx].length != 0)) {
2983 			if (send_done_count != tx->pkt_done)
2984 				mxge_tx_done(ss, (int)send_done_count);
2985 			mxge_clean_rx_done(ss);
2986 			send_done_count = be32toh(stats->send_done_count);
2987 		}
2988 		if (sc->legacy_irq && mxge_deassert_wait)
2989 			wmb();
2990 	} while (*((volatile uint8_t *) &stats->valid));
2991 
2992 	/* fw link & error stats meaningful only on the first slice */
2993 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2994 		if (sc->link_state != stats->link_up) {
2995 			sc->link_state = stats->link_up;
2996 			if (sc->link_state) {
2997 				sc->ifp->if_link_state = LINK_STATE_UP;
2998 				if_link_state_change(sc->ifp);
2999 				if (mxge_verbose)
3000 					device_printf(sc->dev, "link up\n");
3001 			} else {
3002 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3003 				if_link_state_change(sc->ifp);
3004 				if (mxge_verbose)
3005 					device_printf(sc->dev, "link down\n");
3006 			}
3007 			sc->need_media_probe = 1;
3008 		}
3009 		if (sc->rdma_tags_available !=
3010 		    be32toh(stats->rdma_tags_available)) {
3011 			sc->rdma_tags_available =
3012 				be32toh(stats->rdma_tags_available);
3013 			device_printf(sc->dev, "RDMA timed out! %d tags "
3014 				      "left\n", sc->rdma_tags_available);
3015 		}
3016 
3017 		if (stats->link_down) {
3018 			sc->down_cnt += stats->link_down;
3019 			sc->link_state = 0;
3020 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3021 			if_link_state_change(sc->ifp);
3022 		}
3023 	}
3024 
3025 	/* check to see if we have rx token to pass back */
3026 	if (valid & 0x1)
3027 		*ss->irq_claim = be32toh(3);
3028 	*(ss->irq_claim + 1) = be32toh(3);
3029 }
3030 
3031 static void
3032 mxge_init(void *arg)
3033 {
3034 	/* nothing to do here; the interface is brought up via mxge_open() */
}
3035 
3036 
3037 
3038 static void
3039 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3040 {
3041 	struct lro_entry *lro_entry;
3042 	int i;
3043 
3044 	while (!SLIST_EMPTY(&ss->lro_free)) {
3045 		lro_entry = SLIST_FIRST(&ss->lro_free);
3046 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3047 		kfree(lro_entry, M_DEVBUF);
3048 	}
3049 
3050 	for (i = 0; i <= ss->rx_big.mask; i++) {
3051 		if (ss->rx_big.info[i].m == NULL)
3052 			continue;
3053 		bus_dmamap_unload(ss->rx_big.dmat,
3054 				  ss->rx_big.info[i].map);
3055 		m_freem(ss->rx_big.info[i].m);
3056 		ss->rx_big.info[i].m = NULL;
3057 	}
3058 
3059 	for (i = 0; i <= ss->rx_small.mask; i++) {
3060 		if (ss->rx_small.info[i].m == NULL)
3061 			continue;
3062 		bus_dmamap_unload(ss->rx_small.dmat,
3063 				  ss->rx_small.info[i].map);
3064 		m_freem(ss->rx_small.info[i].m);
3065 		ss->rx_small.info[i].m = NULL;
3066 	}
3067 
3068 	/* transmit ring used only on the first slice */
3069 	if (ss->tx.info == NULL)
3070 		return;
3071 
3072 	for (i = 0; i <= ss->tx.mask; i++) {
3073 		ss->tx.info[i].flag = 0;
3074 		if (ss->tx.info[i].m == NULL)
3075 			continue;
3076 		bus_dmamap_unload(ss->tx.dmat,
3077 				  ss->tx.info[i].map);
3078 		m_freem(ss->tx.info[i].m);
3079 		ss->tx.info[i].m = NULL;
3080 	}
3081 }
3082 
3083 static void
3084 mxge_free_mbufs(mxge_softc_t *sc)
3085 {
3086 	int slice;
3087 
3088 	for (slice = 0; slice < sc->num_slices; slice++)
3089 		mxge_free_slice_mbufs(&sc->ss[slice]);
3090 }
3091 
3092 static void
3093 mxge_free_slice_rings(struct mxge_slice_state *ss)
3094 {
3095 	int i;
3096 
3097 
3098 	if (ss->rx_done.entry != NULL)
3099 		mxge_dma_free(&ss->rx_done.dma);
3100 	ss->rx_done.entry = NULL;
3101 
3102 	if (ss->tx.req_bytes != NULL)
3103 		kfree(ss->tx.req_bytes, M_DEVBUF);
3104 	ss->tx.req_bytes = NULL;
3105 
3106 	if (ss->tx.seg_list != NULL)
3107 		kfree(ss->tx.seg_list, M_DEVBUF);
3108 	ss->tx.seg_list = NULL;
3109 
3110 	if (ss->rx_small.shadow != NULL)
3111 		kfree(ss->rx_small.shadow, M_DEVBUF);
3112 	ss->rx_small.shadow = NULL;
3113 
3114 	if (ss->rx_big.shadow != NULL)
3115 		kfree(ss->rx_big.shadow, M_DEVBUF);
3116 	ss->rx_big.shadow = NULL;
3117 
3118 	if (ss->tx.info != NULL) {
3119 		if (ss->tx.dmat != NULL) {
3120 			for (i = 0; i <= ss->tx.mask; i++) {
3121 				bus_dmamap_destroy(ss->tx.dmat,
3122 						   ss->tx.info[i].map);
3123 			}
3124 			bus_dma_tag_destroy(ss->tx.dmat);
3125 		}
3126 		kfree(ss->tx.info, M_DEVBUF);
3127 	}
3128 	ss->tx.info = NULL;
3129 
3130 	if (ss->rx_small.info != NULL) {
3131 		if (ss->rx_small.dmat != NULL) {
3132 			for (i = 0; i <= ss->rx_small.mask; i++) {
3133 				bus_dmamap_destroy(ss->rx_small.dmat,
3134 						   ss->rx_small.info[i].map);
3135 			}
3136 			bus_dmamap_destroy(ss->rx_small.dmat,
3137 					   ss->rx_small.extra_map);
3138 			bus_dma_tag_destroy(ss->rx_small.dmat);
3139 		}
3140 		kfree(ss->rx_small.info, M_DEVBUF);
3141 	}
3142 	ss->rx_small.info = NULL;
3143 
3144 	if (ss->rx_big.info != NULL) {
3145 		if (ss->rx_big.dmat != NULL) {
3146 			for (i = 0; i <= ss->rx_big.mask; i++) {
3147 				bus_dmamap_destroy(ss->rx_big.dmat,
3148 						   ss->rx_big.info[i].map);
3149 			}
3150 			bus_dmamap_destroy(ss->rx_big.dmat,
3151 					   ss->rx_big.extra_map);
3152 			bus_dma_tag_destroy(ss->rx_big.dmat);
3153 		}
3154 		kfree(ss->rx_big.info, M_DEVBUF);
3155 	}
3156 	ss->rx_big.info = NULL;
3157 }
3158 
3159 static void
3160 mxge_free_rings(mxge_softc_t *sc)
3161 {
3162 	int slice;
3163 
3164 	for (slice = 0; slice < sc->num_slices; slice++)
3165 		mxge_free_slice_rings(&sc->ss[slice]);
3166 }
3167 
3168 static int
3169 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3170 		       int tx_ring_entries)
3171 {
3172 	mxge_softc_t *sc = ss->sc;
3173 	size_t bytes;
3174 	int err, i;
3175 
3176 	err = ENOMEM;
3177 
3178 	/* allocate per-slice receive resources */
3179 
3180 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3181 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3182 
3183 	/* allocate the rx shadow rings */
3184 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3185 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3186 	if (ss->rx_small.shadow == NULL)
3187 		return err;
3188 
3189 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3190 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191 	if (ss->rx_big.shadow == NULL)
3192 		return err;
3193 
3194 	/* allocate the rx host info rings */
3195 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3196 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3197 	if (ss->rx_small.info == NULL)
3198 		return err;
3199 
3200 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3201 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3202 	if (ss->rx_big.info == NULL)
3203 		return err;
3204 
3205 	/* allocate the rx busdma resources */
3206 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3207 				 1,			/* alignment */
3208 				 4096,			/* boundary */
3209 				 BUS_SPACE_MAXADDR,	/* low */
3210 				 BUS_SPACE_MAXADDR,	/* high */
3211 				 NULL, NULL,		/* filter */
3212 				 MHLEN,			/* maxsize */
3213 				 1,			/* num segs */
3214 				 MHLEN,			/* maxsegsize */
3215 				 BUS_DMA_ALLOCNOW,	/* flags */
3216 				 &ss->rx_small.dmat);	/* tag */
3217 	if (err != 0) {
3218 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3219 			      err);
3220 		return err;
3221 	}
3222 
3223 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3224 				 1,			/* alignment */
3225 #if MXGE_VIRT_JUMBOS
3226 				 4096,			/* boundary */
3227 #else
3228 				 0,			/* boundary */
3229 #endif
3230 				 BUS_SPACE_MAXADDR,	/* low */
3231 				 BUS_SPACE_MAXADDR,	/* high */
3232 				 NULL, NULL,		/* filter */
3233 				 3*4096,		/* maxsize */
3234 #if MXGE_VIRT_JUMBOS
3235 				 3,			/* num segs */
3236 				 4096,			/* maxsegsize*/
3237 #else
3238 				 1,			/* num segs */
3239 				 MJUM9BYTES,		/* maxsegsize*/
3240 #endif
3241 				 BUS_DMA_ALLOCNOW,	/* flags */
3242 				 &ss->rx_big.dmat);	/* tag */
3243 	if (err != 0) {
3244 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3245 			      err);
3246 		return err;
3247 	}
3248 	for (i = 0; i <= ss->rx_small.mask; i++) {
3249 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3250 					&ss->rx_small.info[i].map);
3251 		if (err != 0) {
3252 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3253 				      err);
3254 			return err;
3255 		}
3256 	}
3257 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3258 				&ss->rx_small.extra_map);
3259 	if (err != 0) {
3260 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3261 			      err);
3262 		return err;
3263 	}
3264 
3265 	for (i = 0; i <= ss->rx_big.mask; i++) {
3266 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3267 					&ss->rx_big.info[i].map);
3268 		if (err != 0) {
3269 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3270 				      err);
3271 			return err;
3272 		}
3273 	}
3274 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3275 				&ss->rx_big.extra_map);
3276 	if (err != 0) {
3277 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3278 			      err);
3279 		return err;
3280 	}
3281 
3282 	/* now allocate TX resources */
3283 
3284 #ifndef IFNET_BUF_RING
3285 	/* only use a single TX ring for now */
3286 	if (ss != ss->sc->ss)
3287 		return 0;
3288 #endif
3289 
3290 	ss->tx.mask = tx_ring_entries - 1;
3291 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3292 
3293 
3294 	/* allocate the tx request copy block */
3295 	bytes = 8 +
3296 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3297 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3298 	if (ss->tx.req_bytes == NULL)
3299 		return err;
3300 	/* ensure req_list entries are aligned to 8 bytes */
3301 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3302 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
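	/*
	 * Alignment example (illustrative): if req_bytes ends in 0x05,
	 * adding 7 gives 0x0c and masking with ~7UL rounds down to
	 * 0x08, the next 8-byte boundary; the 8 spare bytes in the
	 * allocation above keep the rounded pointer inside the block.
	 */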
3303 
3304 	/* allocate the tx busdma segment list */
3305 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3306 	ss->tx.seg_list = (bus_dma_segment_t *)
3307 		kmalloc(bytes, M_DEVBUF, M_WAITOK);
3308 	if (ss->tx.seg_list == NULL)
3309 		return err;
3310 
3311 	/* allocate the tx host info ring */
3312 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3313 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3314 	if (ss->tx.info == NULL)
3315 		return err;
3316 
3317 	/* allocate the tx busdma resources */
3318 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3319 				 1,			/* alignment */
3320 				 sc->tx_boundary,	/* boundary */
3321 				 BUS_SPACE_MAXADDR,	/* low */
3322 				 BUS_SPACE_MAXADDR,	/* high */
3323 				 NULL, NULL,		/* filter */
3324 				 65536 + 256,		/* maxsize */
3325 				 ss->tx.max_desc - 2,	/* num segs */
3326 				 sc->tx_boundary,	/* maxsegsz */
3327 				 BUS_DMA_ALLOCNOW,	/* flags */
3328 				 &ss->tx.dmat);		/* tag */
3329 
3330 	if (err != 0) {
3331 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3332 			      err);
3333 		return err;
3334 	}
3335 
3336 	/* now use these tags to setup dmamaps for each slot
3337 	   in the ring */
3338 	for (i = 0; i <= ss->tx.mask; i++) {
3339 		err = bus_dmamap_create(ss->tx.dmat, 0,
3340 					&ss->tx.info[i].map);
3341 		if (err != 0) {
3342 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3343 				      err);
3344 			return err;
3345 		}
3346 	}
3347 	return 0;
3348 
3349 }
3350 
3351 static int
3352 mxge_alloc_rings(mxge_softc_t *sc)
3353 {
3354 	mxge_cmd_t cmd;
3355 	int tx_ring_size;
3356 	int tx_ring_entries, rx_ring_entries;
3357 	int err, slice;
3358 
3359 	/* get ring sizes */
3360 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3361 	tx_ring_size = cmd.data0;
3362 	if (err != 0) {
3363 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3364 		goto abort;
3365 	}
3366 
3367 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3368 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3369 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3370 	ifq_set_ready(&sc->ifp->if_snd);
3371 
3372 	for (slice = 0; slice < sc->num_slices; slice++) {
3373 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3374 					     rx_ring_entries,
3375 					     tx_ring_entries);
3376 		if (err != 0)
3377 			goto abort;
3378 	}
3379 	return 0;
3380 
3381 abort:
3382 	mxge_free_rings(sc);
3383 	return err;
3384 
3385 }
3386 
3387 
3388 static void
3389 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3390 {
3391 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3392 
3393 	if (bufsize < MCLBYTES) {
3394 		/* easy, everything fits in a single buffer */
3395 		*big_buf_size = MCLBYTES;
3396 		*cl_size = MCLBYTES;
3397 		*nbufs = 1;
3398 		return;
3399 	}
3400 
3401 	if (bufsize < MJUMPAGESIZE) {
3402 		/* still easy, everything still fits in a single buffer */
3403 		*big_buf_size = MJUMPAGESIZE;
3404 		*cl_size = MJUMPAGESIZE;
3405 		*nbufs = 1;
3406 		return;
3407 	}
3408 #if MXGE_VIRT_JUMBOS
3409 	/* now we need to use virtually contiguous buffers */
3410 	*cl_size = MJUM9BYTES;
3411 	*big_buf_size = 4096;
3412 	*nbufs = mtu / 4096 + 1;
3413 	/* needs to be a power of two, so round up */
3414 	if (*nbufs == 3)
3415 		*nbufs = 4;
3416 #else
3417 	*cl_size = MJUM9BYTES;
3418 	*big_buf_size = MJUM9BYTES;
3419 	*nbufs = 1;
3420 #endif
3421 }
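/*
 * Example (illustrative): a 9000-byte MTU gives bufsize = 9000 + 14 +
 * 4 + 2 = 9020 bytes, which exceeds MJUMPAGESIZE, so the non-virt-jumbo
 * path above hands the firmware one MJUM9BYTES cluster per frame.
 */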
3422 
3423 static int
3424 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3425 {
3426 	mxge_softc_t *sc;
3427 	mxge_cmd_t cmd;
3428 	bus_dmamap_t map;
3429 	struct lro_entry *lro_entry;
3430 	int err, i, slice;
3431 
3432 
3433 	sc = ss->sc;
3434 	slice = ss - sc->ss;
3435 
3436 	SLIST_INIT(&ss->lro_free);
3437 	SLIST_INIT(&ss->lro_active);
3438 
3439 	for (i = 0; i < sc->lro_cnt; i++) {
3440 		lro_entry = (struct lro_entry *)
3441 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3442 			       M_NOWAIT | M_ZERO);
3443 		if (lro_entry == NULL) {
3444 			sc->lro_cnt = i;
3445 			break;
3446 		}
3447 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3448 	}
3449 	/* get the lanai pointers to the send and receive rings */
3450 
3451 	err = 0;
3452 #ifndef IFNET_BUF_RING
3453 	/* We currently only send from the first slice */
3454 	if (slice == 0) {
3455 #endif
3456 		cmd.data0 = slice;
3457 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3458 		ss->tx.lanai =
3459 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3460 		ss->tx.send_go = (volatile uint32_t *)
3461 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3462 		ss->tx.send_stop = (volatile uint32_t *)
3463 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3464 #ifndef IFNET_BUF_RING
3465 	}
3466 #endif
3467 	cmd.data0 = slice;
3468 	err |= mxge_send_cmd(sc,
3469 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3470 	ss->rx_small.lanai =
3471 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3472 	cmd.data0 = slice;
3473 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3474 	ss->rx_big.lanai =
3475 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3476 
3477 	if (err != 0) {
3478 		device_printf(sc->dev,
3479 			      "failed to get ring sizes or locations\n");
3480 		return EIO;
3481 	}
3482 
3483 	/* stock receive rings */
3484 	for (i = 0; i <= ss->rx_small.mask; i++) {
3485 		map = ss->rx_small.info[i].map;
3486 		err = mxge_get_buf_small(ss, map, i);
3487 		if (err) {
3488 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3489 				      i, ss->rx_small.mask + 1);
3490 			return ENOMEM;
3491 		}
3492 	}
3493 	for (i = 0; i <= ss->rx_big.mask; i++) {
3494 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3495 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3496 	}
3497 	ss->rx_big.nbufs = nbufs;
3498 	ss->rx_big.cl_size = cl_size;
3499 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3500 		EVL_ENCAPLEN + MXGEFW_PAD;
3501 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3502 		map = ss->rx_big.info[i].map;
3503 		err = mxge_get_buf_big(ss, map, i);
3504 		if (err) {
3505 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3506 				      i, ss->rx_big.mask + 1);
3507 			return ENOMEM;
3508 		}
3509 	}
3510 	return 0;
3511 }
3512 
3513 static int
3514 mxge_open(mxge_softc_t *sc)
3515 {
3516 	mxge_cmd_t cmd;
3517 	int err, big_bytes, nbufs, slice, cl_size, i;
3518 	bus_addr_t bus;
3519 	volatile uint8_t *itable;
3520 	struct mxge_slice_state *ss;
3521 
3522 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3523 	/* Copy the MAC address in case it was overridden */
3524 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3525 
3526 	err = mxge_reset(sc, 1);
3527 	if (err != 0) {
3528 		device_printf(sc->dev, "failed to reset\n");
3529 		return EIO;
3530 	}
3531 
3532 	if (sc->num_slices > 1) {
3533 		/* setup the indirection table */
3534 		cmd.data0 = sc->num_slices;
3535 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3536 				    &cmd);
3537 
3538 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3539 				     &cmd);
3540 		if (err != 0) {
3541 			device_printf(sc->dev,
3542 				      "failed to setup rss tables\n");
3543 			return err;
3544 		}
3545 
3546 		/* just enable an identity mapping */
3547 		itable = sc->sram + cmd.data0;
3548 		for (i = 0; i < sc->num_slices; i++)
3549 			itable[i] = (uint8_t)i;
3550 
3551 		cmd.data0 = 1;
3552 		cmd.data1 = mxge_rss_hash_type;
3553 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3554 		if (err != 0) {
3555 			device_printf(sc->dev, "failed to enable slices\n");
3556 			return err;
3557 		}
3558 	}
3559 
3560 
3561 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3562 
3563 	cmd.data0 = nbufs;
3564 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3565 			    &cmd);
3566 	/* error is only meaningful if we're trying to set
3567 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3568 	if (err && nbufs > 1) {
3569 		device_printf(sc->dev,
3570 			      "Failed to set always-use-n to %d\n",
3571 			      nbufs);
3572 		return EIO;
3573 	}
3574 	/* Give the firmware the mtu and the big and small buffer
3575 	   sizes.  The firmware wants the big buf size to be a power
3576 	   of two. Luckily, FreeBSD's clusters are powers of two */
3577 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3578 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3579 	cmd.data0 = MHLEN - MXGEFW_PAD;
3580 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3581 			     &cmd);
3582 	cmd.data0 = big_bytes;
3583 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3584 
3585 	if (err != 0) {
3586 		device_printf(sc->dev, "failed to setup params\n");
3587 		goto abort;
3588 	}
3589 
3590 	/* Now give the firmware the pointer to the stats block */
3591 	for (slice = 0;
3592 #ifdef IFNET_BUF_RING
3593 	     slice < sc->num_slices;
3594 #else
3595 	     slice < 1;
3596 #endif
3597 	     slice++) {
3598 		ss = &sc->ss[slice];
3599 		cmd.data0 =
3600 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3601 		cmd.data1 =
3602 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3603 		cmd.data2 = sizeof(struct mcp_irq_data);
3604 		cmd.data2 |= (slice << 16);
3605 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3606 	}
3607 
3608 	if (err != 0) {
3609 		bus = sc->ss->fw_stats_dma.bus_addr;
3610 		bus += offsetof(struct mcp_irq_data, send_done_count);
3611 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3612 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3613 		err = mxge_send_cmd(sc,
3614 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3615 				    &cmd);
3616 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3617 		sc->fw_multicast_support = 0;
3618 	} else {
3619 		sc->fw_multicast_support = 1;
3620 	}
3621 
3622 	if (err != 0) {
3623 		device_printf(sc->dev, "failed to setup params\n");
3624 		goto abort;
3625 	}
3626 
3627 	for (slice = 0; slice < sc->num_slices; slice++) {
3628 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3629 		if (err != 0) {
3630 			device_printf(sc->dev, "couldn't open slice %d\n",
3631 				      slice);
3632 			goto abort;
3633 		}
3634 	}
3635 
3636 	/* Finally, start the firmware running */
3637 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3638 	if (err) {
3639 		device_printf(sc->dev, "Couldn't bring up link\n");
3640 		goto abort;
3641 	}
3642 #ifdef IFNET_BUF_RING
3643 	for (slice = 0; slice < sc->num_slices; slice++) {
3644 		ss = &sc->ss[slice];
3645 		ss->if_flags |= IFF_RUNNING;
3646 		ss->if_flags &= ~IFF_OACTIVE;
3647 	}
3648 #endif
3649 	sc->ifp->if_flags |= IFF_RUNNING;
3650 	sc->ifp->if_flags &= ~IFF_OACTIVE;
3651 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3652 
3653 	return 0;
3654 
3655 
3656 abort:
3657 	mxge_free_mbufs(sc);
3658 
3659 	return err;
3660 }
3661 
3662 static int
3663 mxge_close(mxge_softc_t *sc)
3664 {
3665 	mxge_cmd_t cmd;
3666 	int err, old_down_cnt;
3667 #ifdef IFNET_BUF_RING
3668 	struct mxge_slice_state *ss;
3669 	int slice;
3670 #endif
3671 
3672 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3673 	callout_stop(&sc->co_hdl);
3674 #ifdef IFNET_BUF_RING
3675 	for (slice = 0; slice < sc->num_slices; slice++) {
3676 		ss = &sc->ss[slice];
3677 		ss->if_flags &= ~IFF_RUNNING;
3678 	}
3679 #endif
3680 	sc->ifp->if_flags &= ~IFF_RUNNING;
3681 	old_down_cnt = sc->down_cnt;
3682 	wmb();
3683 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3684 	if (err) {
3685 		device_printf(sc->dev, "Couldn't bring down link\n");
3686 	}
3687 	if (old_down_cnt == sc->down_cnt) {
3688 		/* wait for down irq */
3689 		DELAY(10 * sc->intr_coal_delay);
3690 	}
3691 	wmb();
3692 	if (old_down_cnt == sc->down_cnt) {
3693 		device_printf(sc->dev, "never got down irq\n");
3694 	}
3695 
3696 	mxge_free_mbufs(sc);
3697 
3698 	return 0;
3699 }
3700 
3701 static void
3702 mxge_setup_cfg_space(mxge_softc_t *sc)
3703 {
3704 	device_t dev = sc->dev;
3705 	int reg;
3706 	uint16_t cmd, lnk, pectl;
3707 
3708 	/* find the PCIe link width and set max read request to 4KB*/
3709 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3710 		lnk = pci_read_config(dev, reg + 0x12, 2);
3711 		sc->link_width = (lnk >> 4) & 0x3f;
3712 
3713 		pectl = pci_read_config(dev, reg + 0x8, 2);
3714 		pectl = (pectl & ~0x7000) | (5 << 12);
3715 		pci_write_config(dev, reg + 0x8, pectl, 2);
3716 	}
3717 
3718 	/* Enable DMA and Memory space access */
3719 	pci_enable_busmaster(dev);
3720 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3721 	cmd |= PCIM_CMD_MEMEN;
3722 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3723 }
3724 
3725 static uint32_t
3726 mxge_read_reboot(mxge_softc_t *sc)
3727 {
3728 	device_t dev = sc->dev;
3729 	uint32_t vs;
3730 
3731 	/* find the vendor specific offset */
3732 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3733 		device_printf(sc->dev,
3734 			      "could not find vendor specific offset\n");
3735 		return (uint32_t)-1;
3736 	}
3737 	/* enable read32 mode */
3738 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3739 	/* tell NIC which register to read */
3740 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3741 	return (pci_read_config(dev, vs + 0x14, 4));
3742 }
3743 
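/*
 * Recover from a stalled transmit ring.  If config space reads as all
 * ones the device has vanished; if the busmaster bit was cleared the
 * NIC rebooted, so restore config space and reopen the interface.
 * Otherwise just dump the ring state and leave the NIC alone.
 */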
3744 static int
3745 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3746 {
3747 	struct pci_devinfo *dinfo;
3748 	mxge_tx_ring_t *tx;
3749 	int err;
3750 	uint32_t reboot;
3751 	uint16_t cmd;
3752 
3753 	err = ENXIO;
3754 
3755 	device_printf(sc->dev, "Watchdog reset!\n");
3756 
3757 	/*
3758 	 * check to see if the NIC rebooted.  If it did, then all of
3759 	 * PCI config space has been reset, and things like the
3760 	 * busmaster bit will be zero.  If this is the case, then we
3761 	 * must restore PCI config space before the NIC can be used
3762 	 * again
3763 	 */
3764 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3765 	if (cmd == 0xffff) {
3766 		/*
3767 		 * maybe the watchdog caught the NIC rebooting; wait
3768 		 * up to 100ms for it to finish.  If it does not come
3769 		 * back, then give up
3770 		 */
3771 		DELAY(1000*100);
3772 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3773 		if (cmd == 0xffff) {
3774 			device_printf(sc->dev, "NIC disappeared!\n");
3775 			return (err);
3776 		}
3777 	}
3778 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3779 		/* print the reboot status */
3780 		reboot = mxge_read_reboot(sc);
3781 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3782 			      reboot);
3783 		/* restore PCI configuration space */
3784 		dinfo = device_get_ivars(sc->dev);
3785 		pci_cfg_restore(sc->dev, dinfo);
3786 
3787 		/* and redo any changes we made to our config space */
3788 		mxge_setup_cfg_space(sc);
3789 
3790 		if (sc->ifp->if_flags & IFF_RUNNING) {
3791 			mxge_close(sc);
3792 			err = mxge_open(sc);
3793 		}
3794 	} else {
3795 		tx = &sc->ss[slice].tx;
3796 		device_printf(sc->dev,
3797 			      "NIC did not reboot, slice %d ring state:\n",
3798 			      slice);
3799 		device_printf(sc->dev,
3800 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3801 			      tx->req, tx->done, tx->queue_active);
3802 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3803 			      tx->activate, tx->deactivate);
3804 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3805 			      tx->pkt_done,
3806 			      be32toh(sc->ss->fw_stats->send_done_count));
3807 		device_printf(sc->dev, "not resetting\n");
3808 	}
3809 	return (err);
3810 }
3811 
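/*
 * Check each transmit ring for requests that have been pending since
 * the previous pass.  A stalled ring triggers a watchdog reset unless
 * the stall is explained by pause frames from the link partner.
 */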
3812 static int
3813 mxge_watchdog(mxge_softc_t *sc)
3814 {
3815 	mxge_tx_ring_t *tx;
3816 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3817 	int i, err = 0;
3818 
3819 	/* see if we have outstanding transmits, which
3820 	   have been pending for more than mxge_ticks */
3821 	for (i = 0;
3822 #ifdef IFNET_BUF_RING
3823 	     (i < sc->num_slices) && (err == 0);
3824 #else
3825 	     (i < 1) && (err == 0);
3826 #endif
3827 	     i++) {
3828 		tx = &sc->ss[i].tx;
3829 		if (tx->req != tx->done &&
3830 		    tx->watchdog_req != tx->watchdog_done &&
3831 		    tx->done == tx->watchdog_done) {
3832 			/* check for pause blocking before resetting */
3833 			if (tx->watchdog_rx_pause == rx_pause)
3834 				err = mxge_watchdog_reset(sc, i);
3835 			else
3836 				device_printf(sc->dev, "Flow control blocking "
3837 					      "xmits, check link partner\n");
3838 		}
3839 
3840 		tx->watchdog_req = tx->req;
3841 		tx->watchdog_done = tx->done;
3842 		tx->watchdog_rx_pause = rx_pause;
3843 	}
3844 
3845 	if (sc->need_media_probe)
3846 		mxge_media_probe(sc);
3847 	return (err);
3848 }
3849 
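/* Aggregate the per-slice counters into the ifnet statistics. */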
3850 static void
3851 mxge_update_stats(mxge_softc_t *sc)
3852 {
3853 	struct mxge_slice_state *ss;
3854 	u_long ipackets = 0;
3855 	u_long opackets = 0;
3856 #ifdef IFNET_BUF_RING
3857 	u_long obytes = 0;
3858 	u_long omcasts = 0;
3859 	u_long odrops = 0;
3860 #endif
3861 	u_long oerrors = 0;
3862 	int slice;
3863 
3864 	for (slice = 0; slice < sc->num_slices; slice++) {
3865 		ss = &sc->ss[slice];
3866 		ipackets += ss->ipackets;
3867 		opackets += ss->opackets;
3868 #ifdef IFNET_BUF_RING
3869 		obytes += ss->obytes;
3870 		omcasts += ss->omcasts;
3871 		odrops += ss->tx.br->br_drops;
3872 #endif
3873 		oerrors += ss->oerrors;
3874 	}
3875 	sc->ifp->if_ipackets = ipackets;
3876 	sc->ifp->if_opackets = opackets;
3877 #ifdef IFNET_BUF_RING
3878 	sc->ifp->if_obytes = obytes;
3879 	sc->ifp->if_omcasts = omcasts;
3880 	sc->ifp->if_snd.ifq_drops = odrops;
3881 #endif
3882 	sc->ifp->if_oerrors = oerrors;
3883 }
3884 
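/*
 * Periodic housekeeping: refresh the aggregated statistics on every
 * pass and run the transmit watchdog roughly every fourth pass; the
 * callout re-arms itself unless the watchdog reset failed.
 */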
3885 static void
3886 mxge_tick(void *arg)
3887 {
3888 	mxge_softc_t *sc = arg;
3889 	int err = 0;
3890 
3891 	lwkt_serialize_enter(sc->ifp->if_serializer);
3892 	/* aggregate stats from different slices */
3893 	mxge_update_stats(sc);
3894 	if (!sc->watchdog_countdown) {
3895 		err = mxge_watchdog(sc);
3896 		sc->watchdog_countdown = 4;
3897 	}
3898 	sc->watchdog_countdown--;
3899 	if (err == 0)
3900 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3901 	lwkt_serialize_exit(sc->ifp->if_serializer);
3902 }
3903 
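/* Media selection is not configurable; reject all change requests. */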
3904 static int
3905 mxge_media_change(struct ifnet *ifp)
3906 {
3907 	return EINVAL;
3908 }
3909 
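/*
 * Validate the requested MTU against the firmware limit and, if the
 * interface is running, restart it so the new buffers take effect.
 * On failure the previous MTU is restored.
 */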
3910 static int
3911 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3912 {
3913 	struct ifnet *ifp = sc->ifp;
3914 	int real_mtu, old_mtu;
3915 	int err = 0;
3916 
3917 	if (ifp->if_serializer)
3918 		ASSERT_SERIALIZED(ifp->if_serializer);
3919 
3920 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3921 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3922 		return EINVAL;
3923 	old_mtu = ifp->if_mtu;
3924 	ifp->if_mtu = mtu;
3925 	if (ifp->if_flags & IFF_RUNNING) {
3926 		mxge_close(sc);
3927 		err = mxge_open(sc);
3928 		if (err != 0) {
3929 			ifp->if_mtu = old_mtu;
3930 			mxge_close(sc);
3931 			(void) mxge_open(sc);
3932 		}
3933 	}
3934 	return err;
3935 }
3936 
3937 static void
3938 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3939 {
3940 	mxge_softc_t *sc = ifp->if_softc;
3941 
3943 	if (sc == NULL)
3944 		return;
3945 	ifmr->ifm_status = IFM_AVALID;
3946 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3947 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3948 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3949 }
3950 
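/* Interface ioctl handler; runs under the ifnet serializer. */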
3951 static int
3952 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3953 {
3954 	mxge_softc_t *sc = ifp->if_softc;
3955 	struct ifreq *ifr = (struct ifreq *)data;
3956 	int err, mask;
3957 
3958 	(void)cr;
3959 	err = 0;
3960 	ASSERT_SERIALIZED(ifp->if_serializer);
3961 	switch (command) {
3962 	case SIOCSIFADDR:
3963 	case SIOCGIFADDR:
3964 		err = ether_ioctl(ifp, command, data);
3965 		break;
3966 
3967 	case SIOCSIFMTU:
3968 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3969 		break;
3970 
3971 	case SIOCSIFFLAGS:
3972 		if (sc->dying) {
3973 			return EINVAL;
3974 		}
3975 		if (ifp->if_flags & IFF_UP) {
3976 			if (!(ifp->if_flags & IFF_RUNNING)) {
3977 				err = mxge_open(sc);
3978 			} else {
3979 				/* take care of promiscuous and
3980 				   allmulti flag changes */
3981 				mxge_change_promisc(sc,
3982 						    ifp->if_flags & IFF_PROMISC);
3983 				mxge_set_multicast_list(sc);
3984 			}
3985 		} else {
3986 			if (ifp->if_flags & IFF_RUNNING) {
3987 				mxge_close(sc);
3988 			}
3989 		}
3990 		break;
3991 
3992 	case SIOCADDMULTI:
3993 	case SIOCDELMULTI:
3994 		mxge_set_multicast_list(sc);
3995 		break;
3996 
3997 	case SIOCSIFCAP:
3998 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3999 		if (mask & IFCAP_TXCSUM) {
4000 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4001 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4002 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4003 						      | CSUM_TSO);
4004 			} else {
4005 				ifp->if_capenable |= IFCAP_TXCSUM;
4006 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4007 			}
4008 		} else if (mask & IFCAP_RXCSUM) {
4009 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4010 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4011 				sc->csum_flag = 0;
4012 			} else {
4013 				ifp->if_capenable |= IFCAP_RXCSUM;
4014 				sc->csum_flag = 1;
4015 			}
4016 		}
4017 		if (mask & IFCAP_TSO4) {
4018 			if (IFCAP_TSO4 & ifp->if_capenable) {
4019 				ifp->if_capenable &= ~IFCAP_TSO4;
4020 				ifp->if_hwassist &= ~CSUM_TSO;
4021 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4022 				ifp->if_capenable |= IFCAP_TSO4;
4023 				ifp->if_hwassist |= CSUM_TSO;
4024 			} else {
4025 				kprintf("mxge requires tx checksum offload"
4026 				       " be enabled to use TSO\n");
4027 				err = EINVAL;
4028 			}
4029 		}
4030 		if (mask & IFCAP_LRO) {
4031 			if (IFCAP_LRO & ifp->if_capenable)
4032 				err = mxge_change_lro_locked(sc, 0);
4033 			else
4034 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4035 		}
4036 		if (mask & IFCAP_VLAN_HWTAGGING)
4037 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4038 		VLAN_CAPABILITIES(ifp);
4039 
4040 		break;
4041 
4042 	case SIOCGIFMEDIA:
4043 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4044 				    &sc->media, command);
4045 		break;
4046 
4047 	default:
4048 		err = ENOTTY;
4049 	}
4050 	return err;
4051 }
4052 
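/*
 * Fetch the hw.mxge.* tunables from the kernel environment and clamp
 * them to sane values before they are used during attach.
 */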
4053 static void
4054 mxge_fetch_tunables(mxge_softc_t *sc)
4055 {
4056 
4057 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4058 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4059 			  &mxge_flow_control);
4060 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4061 			  &mxge_intr_coal_delay);
4062 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4063 			  &mxge_nvidia_ecrc_enable);
4064 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4065 			  &mxge_force_firmware);
4066 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4067 			  &mxge_deassert_wait);
4068 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4069 			  &mxge_verbose);
4070 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4071 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4072 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4073 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4074 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4075 	if (sc->lro_cnt != 0)
4076 		mxge_lro_cnt = sc->lro_cnt;
4077 
4078 	if (bootverbose)
4079 		mxge_verbose = 1;
4080 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4081 		mxge_intr_coal_delay = 30;
4082 	if (mxge_ticks == 0)
4083 		mxge_ticks = hz / 2;
4084 	sc->pause = mxge_flow_control;
4085 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4086 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4087 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4088 	}
4089 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4090 	    mxge_initial_mtu < ETHER_MIN_LEN)
4091 		mxge_initial_mtu = ETHERMTU_JUMBO;
4092 }
4093 
4095 static void
4096 mxge_free_slices(mxge_softc_t *sc)
4097 {
4098 	struct mxge_slice_state *ss;
4099 	int i;
4100 
4102 	if (sc->ss == NULL)
4103 		return;
4104 
4105 	for (i = 0; i < sc->num_slices; i++) {
4106 		ss = &sc->ss[i];
4107 		if (ss->fw_stats != NULL) {
4108 			mxge_dma_free(&ss->fw_stats_dma);
4109 			ss->fw_stats = NULL;
4110 #ifdef IFNET_BUF_RING
4111 			if (ss->tx.br != NULL) {
4112 				drbr_free(ss->tx.br, M_DEVBUF);
4113 				ss->tx.br = NULL;
4114 			}
4115 #endif
4116 		}
4117 		if (ss->rx_done.entry != NULL) {
4118 			mxge_dma_free(&ss->rx_done.dma);
4119 			ss->rx_done.entry = NULL;
4120 		}
4121 	}
4122 	kfree(sc->ss, M_DEVBUF);
4123 	sc->ss = NULL;
4124 }
4125 
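/*
 * Allocate the per-slice state: an rx completion queue sized from the
 * firmware's rx ring size for every slice, plus the firmware stats
 * block (and, with IFNET_BUF_RING, a transmit buf_ring) where needed.
 */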
4126 static int
4127 mxge_alloc_slices(mxge_softc_t *sc)
4128 {
4129 	mxge_cmd_t cmd;
4130 	struct mxge_slice_state *ss;
4131 	size_t bytes;
4132 	int err, i, max_intr_slots;
4133 
4134 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4135 	if (err != 0) {
4136 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4137 		return err;
4138 	}
4139 	sc->rx_ring_size = cmd.data0;
4140 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4141 
4142 	bytes = sizeof (*sc->ss) * sc->num_slices;
4143 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4144 	if (sc->ss == NULL)
4145 		return (ENOMEM);
4146 	for (i = 0; i < sc->num_slices; i++) {
4147 		ss = &sc->ss[i];
4148 
4149 		ss->sc = sc;
4150 
4151 		/* allocate per-slice rx interrupt queues */
4152 
4153 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4154 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4155 		if (err != 0)
4156 			goto abort;
4157 		ss->rx_done.entry = ss->rx_done.dma.addr;
4158 		bzero(ss->rx_done.entry, bytes);
4159 
4160 		/*
4161 		 * allocate the per-slice firmware stats; stats
4162 		 * (including tx) are used only on the first
4163 		 * slice for now
4164 		 */
4165 #ifndef IFNET_BUF_RING
4166 		if (i > 0)
4167 			continue;
4168 #endif
4169 
4170 		bytes = sizeof (*ss->fw_stats);
4171 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
4173 		if (err != 0)
4174 			goto abort;
4175 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4176 #ifdef IFNET_BUF_RING
4177 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4178 					   &ss->tx.lock);
4179 #endif
4180 	}
4181 
4182 	return (0);
4183 
4184 abort:
4185 	mxge_free_slices(sc);
4186 	return (ENOMEM);
4187 }
4188 
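/*
 * Decide how many slices (rx queues) to use.  Multiple slices require
 * the tunable to allow them, an SMP system, at least two MSI-X
 * vectors and working RSS firmware; the count is then capped by the
 * CPU count or the tunable and rounded down to a power of two.
 */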
4189 static void
4190 mxge_slice_probe(mxge_softc_t *sc)
4191 {
4192 	mxge_cmd_t cmd;
4193 	char *old_fw;
4194 	int msix_cnt, status, max_intr_slots;
4195 
4196 	sc->num_slices = 1;
4197 	/*
4198 	 * don't enable multiple slices if they are disabled by the
4199 	 * tunable, or if this is not an SMP system
4200 	 */
4201 
4202 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4203 		return;
4204 
4205 	/* see how many MSI-X interrupts are available */
4206 	msix_cnt = pci_msix_count(sc->dev);
4207 	if (msix_cnt < 2)
4208 		return;
4209 
4210 	/* now load the slice-aware firmware and see what it supports */
4211 	old_fw = sc->fw_name;
4212 	if (old_fw == mxge_fw_aligned)
4213 		sc->fw_name = mxge_fw_rss_aligned;
4214 	else
4215 		sc->fw_name = mxge_fw_rss_unaligned;
4216 	status = mxge_load_firmware(sc, 0);
4217 	if (status != 0) {
4218 		device_printf(sc->dev, "Falling back to a single slice\n");
4219 		return;
4220 	}
4221 
4222 	/* try to send a reset command to the card to see if it
4223 	   is alive */
4224 	memset(&cmd, 0, sizeof (cmd));
4225 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4226 	if (status != 0) {
4227 		device_printf(sc->dev, "failed reset\n");
4228 		goto abort_with_fw;
4229 	}
4230 
4231 	/* get rx ring size */
4232 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4233 	if (status != 0) {
4234 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4235 		goto abort_with_fw;
4236 	}
4237 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4238 
4239 	/* tell it the size of the interrupt queues */
4240 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4241 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4242 	if (status != 0) {
4243 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4244 		goto abort_with_fw;
4245 	}
4246 
4247 	/* ask the firmware for the maximum number of slices it supports */
4248 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4249 	if (status != 0) {
4250 		device_printf(sc->dev,
4251 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4252 		goto abort_with_fw;
4253 	}
4254 	sc->num_slices = cmd.data0;
4255 	if (sc->num_slices > msix_cnt)
4256 		sc->num_slices = msix_cnt;
4257 
4258 	if (mxge_max_slices == -1) {
4259 		/* cap to number of CPUs in system */
4260 		if (sc->num_slices > ncpus)
4261 			sc->num_slices = ncpus;
4262 	} else {
4263 		if (sc->num_slices > mxge_max_slices)
4264 			sc->num_slices = mxge_max_slices;
4265 	}
4266 	/* make sure it is a power of two */
4267 	while (sc->num_slices & (sc->num_slices - 1))
4268 		sc->num_slices--;
4269 
4270 	if (mxge_verbose)
4271 		device_printf(sc->dev, "using %d slices\n",
4272 			      sc->num_slices);
4273 
4274 	return;
4275 
4276 abort_with_fw:
4277 	sc->fw_name = old_fw;
4278 	(void) mxge_load_firmware(sc, 0);
4279 }
4280 
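/*
 * Allocate one MSI-X vector per slice (the table lives behind BAR 2)
 * and hook each vector to mxge_intr with its slice state as argument.
 * Every failure path unwinds whatever was set up before it.
 */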
4281 static int
4282 mxge_add_msix_irqs(mxge_softc_t *sc)
4283 {
4284 	size_t bytes;
4285 	int count, err, i, rid;
4286 
4287 	rid = PCIR_BAR(2);
4288 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4289 						    &rid, RF_ACTIVE);
4290 
4291 	if (sc->msix_table_res == NULL) {
4292 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4293 		return ENXIO;
4294 	}
4295 
4296 	count = sc->num_slices;
4297 	err = pci_alloc_msix(sc->dev, &count);
4298 	if (err != 0) {
4299 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4300 			      "err = %d\n", sc->num_slices, err);
4301 		goto abort_with_msix_table;
4302 	}
4303 	if (count < sc->num_slices) {
4304 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4305 			      sc->num_slices, count);
4306 		device_printf(sc->dev,
4307 			      "Try setting hw.mxge.max_slices to %d\n",
4308 			      count);
4309 		err = ENOSPC;
4310 		goto abort_with_msix;
4311 	}
4312 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4313 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4314 	if (sc->msix_irq_res == NULL) {
4315 		err = ENOMEM;
4316 		goto abort_with_msix;
4317 	}
4318 
4319 	for (i = 0; i < sc->num_slices; i++) {
4320 		rid = i + 1;
4321 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4322 							  SYS_RES_IRQ,
4323 							  &rid, RF_ACTIVE);
4324 		if (sc->msix_irq_res[i] == NULL) {
4325 			device_printf(sc->dev, "couldn't allocate IRQ res"
4326 				      " for message %d\n", i);
4327 			err = ENXIO;
4328 			goto abort_with_res;
4329 		}
4330 	}
4331 
4332 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4333 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4334 
4335 	for (i = 0; i < sc->num_slices; i++) {
4336 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4337 				     INTR_MPSAFE,
4338 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4339 				     sc->ifp->if_serializer);
4340 		if (err != 0) {
4341 			device_printf(sc->dev, "couldn't setup intr for "
4342 				      "message %d\n", i);
4343 			goto abort_with_intr;
4344 		}
4345 	}
4346 
4347 	if (mxge_verbose) {
4348 		device_printf(sc->dev, "using %d msix IRQs:",
4349 			      sc->num_slices);
4350 		for (i = 0; i < sc->num_slices; i++)
4351 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4352 		kprintf("\n");
4353 	}
4354 	return (0);
4355 
4356 abort_with_intr:
4357 	for (i = 0; i < sc->num_slices; i++) {
4358 		if (sc->msix_ih[i] != NULL) {
4359 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4360 					  sc->msix_ih[i]);
4361 			sc->msix_ih[i] = NULL;
4362 		}
4363 	}
4364 	kfree(sc->msix_ih, M_DEVBUF);
4365 
4367 abort_with_res:
4368 	for (i = 0; i < sc->num_slices; i++) {
4369 		rid = i + 1;
4370 		if (sc->msix_irq_res[i] != NULL)
4371 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4372 					     sc->msix_irq_res[i]);
4373 		sc->msix_irq_res[i] = NULL;
4374 	}
4375 	kfree(sc->msix_irq_res, M_DEVBUF);
4376 
4378 abort_with_msix:
4379 	pci_release_msi(sc->dev);
4380 
4381 abort_with_msix_table:
4382 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4383 			     sc->msix_table_res);
4384 
4385 	return err;
4386 }
4387 
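/* Single-interrupt fallback: prefer MSI, else share a legacy INTx. */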
4388 static int
4389 mxge_add_single_irq(mxge_softc_t *sc)
4390 {
4391 	int count, err, rid;
4392 
4393 	count = pci_msi_count(sc->dev);
4394 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4395 		rid = 1;
4396 	} else {
4397 		rid = 0;
4398 		sc->legacy_irq = 1;
4399 	}
4400 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4401 					 1, RF_SHAREABLE | RF_ACTIVE);
4402 	if (sc->irq_res == NULL) {
4403 		device_printf(sc->dev, "could not alloc interrupt\n");
4404 		return ENXIO;
4405 	}
4406 	if (mxge_verbose)
4407 		device_printf(sc->dev, "using %s irq %ld\n",
4408 			      sc->legacy_irq ? "INTx" : "MSI",
4409 			      rman_get_start(sc->irq_res));
4410 	err = bus_setup_intr(sc->dev, sc->irq_res,
4411 			     INTR_MPSAFE,
4412 			     mxge_intr, &sc->ss[0], &sc->ih,
4413 			     sc->ifp->if_serializer);
4414 	if (err != 0) {
4415 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4416 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4417 		if (!sc->legacy_irq)
4418 			pci_release_msi(sc->dev);
4419 	}
4420 	return err;
4421 }
4422 
4423 static void
4424 mxge_rem_msix_irqs(mxge_softc_t *sc)
4425 {
4426 	int i, rid;
4427 
4428 	for (i = 0; i < sc->num_slices; i++) {
4429 		if (sc->msix_ih[i] != NULL) {
4430 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4431 					  sc->msix_ih[i]);
4432 			sc->msix_ih[i] = NULL;
4433 		}
4434 	}
4435 	kfree(sc->msix_ih, M_DEVBUF);
4436 
4437 	for (i = 0; i < sc->num_slices; i++) {
4438 		rid = i + 1;
4439 		if (sc->msix_irq_res[i] != NULL)
4440 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4441 					     sc->msix_irq_res[i]);
4442 		sc->msix_irq_res[i] = NULL;
4443 	}
4444 	kfree(sc->msix_irq_res, M_DEVBUF);
4445 
4446 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4447 			     sc->msix_table_res);
4448 
4449 	pci_release_msi(sc->dev);
4450 	return;
4451 }
4452 
4453 static void
4454 mxge_rem_single_irq(mxge_softc_t *sc)
4455 {
4456 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4457 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4458 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4459 	if (!sc->legacy_irq)
4460 		pci_release_msi(sc->dev);
4461 }
4462 
4463 static void
4464 mxge_rem_irq(mxge_softc_t *sc)
4465 {
4466 	if (sc->num_slices > 1)
4467 		mxge_rem_msix_irqs(sc);
4468 	else
4469 		mxge_rem_single_irq(sc);
4470 }
4471 
4472 static int
4473 mxge_add_irq(mxge_softc_t *sc)
4474 {
4475 	int err;
4476 
4477 	if (sc->num_slices > 1)
4478 		err = mxge_add_msix_irqs(sc);
4479 	else
4480 		err = mxge_add_single_irq(sc);
4481 
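	/*
	 * XXX the MSI-X teardown/retry below is compiled out (note the
	 * "0 &&"), apparently left in place as a debugging aid.
	 */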
4482 	if (0 && err == 0 && sc->num_slices > 1) {
4483 		mxge_rem_msix_irqs(sc);
4484 		err = mxge_add_msix_irqs(sc);
4485 	}
4486 	return err;
4487 }
4488 
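/*
 * Device attach: map the board's SRAM, parse the EEPROM strings, load
 * and reset the firmware, size the slices and rings, then register
 * the ifnet and wire up interrupts and sysctls.  The abort_* labels
 * unwind the attach state in reverse order.
 */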
4490 static int
4491 mxge_attach(device_t dev)
4492 {
4493 	mxge_softc_t *sc = device_get_softc(dev);
4494 	struct ifnet *ifp = &sc->arpcom.ac_if;
4495 	int err, rid;
4496 
4497 	/*
4498 	 * avoid rewriting half the lines in this file to use
4499 	 * &sc->arpcom.ac_if instead
4500 	 */
4501 	sc->ifp = ifp;
4502 	sc->dev = dev;
4503 	mxge_fetch_tunables(sc);
4504 
4505 	err = bus_dma_tag_create(NULL,			/* parent */
4506 				 1,			/* alignment */
4507 				 0,			/* boundary */
4508 				 BUS_SPACE_MAXADDR,	/* low */
4509 				 BUS_SPACE_MAXADDR,	/* high */
4510 				 NULL, NULL,		/* filter */
4511 				 65536 + 256,		/* maxsize */
4512 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4513 				 65536,			/* maxsegsize */
4514 				 0,			/* flags */
4515 				 &sc->parent_dmat);	/* tag */
4516 
4517 	if (err != 0) {
4518 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4519 			      err);
4520 		goto abort_with_nothing;
4521 	}
4522 
4524 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4525 
4526 	callout_init_mp(&sc->co_hdl);
4527 
4528 	mxge_setup_cfg_space(sc);
4529 
4530 	/* Map the board into the kernel */
4531 	rid = PCIR_BARS;
4532 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4533 					 ~0, 1, RF_ACTIVE);
4534 	if (sc->mem_res == NULL) {
4535 		device_printf(dev, "could not map memory\n");
4536 		err = ENXIO;
4537 		goto abort_with_nothing;
4538 	}
4539 	sc->sram = rman_get_virtual(sc->mem_res);
4540 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4541 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4542 		device_printf(dev, "impossible memory region size %ld\n",
4543 			      rman_get_size(sc->mem_res));
4544 		err = ENXIO;
4545 		goto abort_with_mem_res;
4546 	}
4547 
4548 	/* make a NUL-terminated copy of the EEPROM strings section
4549 	   of Lanai SRAM */
4550 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4551 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4552 				rman_get_bushandle(sc->mem_res),
4553 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4554 				sc->eeprom_strings,
4555 				MXGE_EEPROM_STRINGS_SIZE - 2);
4556 	err = mxge_parse_strings(sc);
4557 	if (err != 0)
4558 		goto abort_with_mem_res;
4559 
4560 	/* Enable write combining for efficient use of PCIe bus */
4561 	mxge_enable_wc(sc);
4562 
4563 	/* Allocate the out of band dma memory */
4564 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4565 			     sizeof (mxge_cmd_t), 64);
4566 	if (err != 0)
4567 		goto abort_with_mem_res;
4568 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4569 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4570 	if (err != 0)
4571 		goto abort_with_cmd_dma;
4572 
4573 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4574 	if (err != 0)
4575 		goto abort_with_zeropad_dma;
4576 
4577 	/* select & load the firmware */
4578 	err = mxge_select_firmware(sc);
4579 	if (err != 0)
4580 		goto abort_with_dmabench;
4581 	sc->intr_coal_delay = mxge_intr_coal_delay;
4582 
4583 	mxge_slice_probe(sc);
4584 	err = mxge_alloc_slices(sc);
4585 	if (err != 0)
4586 		goto abort_with_dmabench;
4587 
4588 	err = mxge_reset(sc, 0);
4589 	if (err != 0)
4590 		goto abort_with_slices;
4591 
4592 	err = mxge_alloc_rings(sc);
4593 	if (err != 0) {
4594 		device_printf(sc->dev, "failed to allocate rings\n");
4595 		goto abort_with_slices;
4596 	}
4597 
4598 	ifp->if_baudrate = IF_Gbps(10UL);
4599 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4600 		IFCAP_VLAN_MTU;
4601 #ifdef INET
4602 	ifp->if_capabilities |= IFCAP_LRO;
4603 #endif
4604 
4605 #ifdef MXGE_NEW_VLAN_API
4606 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4607 #endif
4608 
4609 	sc->max_mtu = mxge_max_mtu(sc);
4610 	if (sc->max_mtu >= 9000)
4611 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4612 	else
4613 		device_printf(dev, "MTU limited to %d.  Install "
4614 			      "latest firmware for 9000 byte jumbo support\n",
4615 			      sc->max_mtu - ETHER_HDR_LEN);
4616 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4617 	ifp->if_capenable = ifp->if_capabilities;
4618 	if (sc->lro_cnt == 0)
4619 		ifp->if_capenable &= ~IFCAP_LRO;
4620 	sc->csum_flag = 1;
4621 	ifp->if_init = mxge_init;
4622 	ifp->if_softc = sc;
4623 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4624 	ifp->if_ioctl = mxge_ioctl;
4625 	ifp->if_start = mxge_start;
4626 	/* Initialise the ifmedia structure */
4627 	ifmedia_init(&sc->media, 0, mxge_media_change,
4628 		     mxge_media_status);
4629 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4630 	mxge_media_probe(sc);
4631 	sc->dying = 0;
4632 	ether_ifattach(ifp, sc->mac_addr, NULL);
4633 	/* ether_ifattach sets mtu to ETHERMTU */
4634 	if (mxge_initial_mtu != ETHERMTU) {
4635 		lwkt_serialize_enter(ifp->if_serializer);
4636 		mxge_change_mtu(sc, mxge_initial_mtu);
4637 		lwkt_serialize_exit(ifp->if_serializer);
4638 	}
4639 	/* must come after ether_ifattach() */
4640 	err = mxge_add_irq(sc);
4641 	if (err != 0) {
4642 		device_printf(sc->dev, "failed to add irq\n");
4643 		goto abort_with_rings;
4644 	}
4645 
4646 	mxge_add_sysctls(sc);
4647 #ifdef IFNET_BUF_RING
4648 	ifp->if_transmit = mxge_transmit;
4649 	ifp->if_qflush = mxge_qflush;
4650 #endif
4651 	return 0;
4652 
4653 abort_with_rings:
4654 	mxge_free_rings(sc);
4655 abort_with_slices:
4656 	mxge_free_slices(sc);
4657 abort_with_dmabench:
4658 	mxge_dma_free(&sc->dmabench_dma);
4659 abort_with_zeropad_dma:
4660 	mxge_dma_free(&sc->zeropad_dma);
4661 abort_with_cmd_dma:
4662 	mxge_dma_free(&sc->cmd_dma);
4663 abort_with_mem_res:
4664 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4665 	pci_disable_busmaster(dev);
4666 	bus_dma_tag_destroy(sc->parent_dmat);
4667 abort_with_nothing:
4668 	return err;
4669 }
4670 
4671 static int
4672 mxge_detach(device_t dev)
4673 {
4674 	mxge_softc_t *sc = device_get_softc(dev);
4675 
4676 	lwkt_serialize_enter(sc->ifp->if_serializer);
4677 	sc->dying = 1;
4678 	if (sc->ifp->if_flags & IFF_RUNNING)
4679 		mxge_close(sc);
4680 	/*
4681 	 * XXX: race: the callout callback could be spinning on
4682 	 * the serializer and run anyway
4683 	 */
4684 	callout_stop(&sc->co_hdl);
4685 	lwkt_serialize_exit(sc->ifp->if_serializer);
4686 
4687 	ether_ifdetach(sc->ifp);
4688 	ifmedia_removeall(&sc->media);
4689 	mxge_dummy_rdma(sc, 0);
4690 	mxge_rem_sysctls(sc);
4691 	mxge_rem_irq(sc);
4692 	mxge_free_rings(sc);
4693 	mxge_free_slices(sc);
4694 	mxge_dma_free(&sc->dmabench_dma);
4695 	mxge_dma_free(&sc->zeropad_dma);
4696 	mxge_dma_free(&sc->cmd_dma);
4697 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4698 	pci_disable_busmaster(dev);
4699 	bus_dma_tag_destroy(sc->parent_dmat);
4700 	return 0;
4701 }
4702 
4703 static int
4704 mxge_shutdown(device_t dev)
4705 {
4706 	return 0;
4707 }
4708 
4709 /*
4710   This file uses Myri10GE driver indentation.
4711 
4712   Local Variables:
4713   c-file-style:"linux"
4714   tab-width:8
4715   End:
4716 */
4717