xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 92fc8b5c)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $
29 
30 ***************************************************************************/
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/linker.h>
35 #include <sys/firmware.h>
36 #include <sys/endian.h>
37 #include <sys/in_cksum.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/module.h>
43 #include <sys/serialize.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
46 
47 /* count xmits ourselves, rather than via drbr */
48 #define NO_SLOW_STATS
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <sys/bus.h>
68 #include <sys/rman.h>
69 
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
73 
74 #include <vm/vm.h>		/* for pmap_mapdev() */
75 #include <vm/pmap.h>
76 
77 #if defined(__i386) || defined(__x86_64)
78 #include <machine/specialreg.h>
79 #endif
80 
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
85 #ifdef IFNET_BUF_RING
86 #include <sys/buf_ring.h>
87 #endif
88 
89 #include "opt_inet.h"
90 
91 /* tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 /* XXX: not yet */
104 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
105 static int mxge_initial_mtu = ETHERMTU;
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
116 
117 static device_method_t mxge_methods[] =
118 {
119   /* Device interface */
120   DEVMETHOD(device_probe, mxge_probe),
121   DEVMETHOD(device_attach, mxge_attach),
122   DEVMETHOD(device_detach, mxge_detach),
123   DEVMETHOD(device_shutdown, mxge_shutdown),
124   {0, 0}
125 };
126 
127 static driver_t mxge_driver =
128 {
129   "mxge",
130   mxge_methods,
131   sizeof(mxge_softc_t),
132 };
133 
134 static devclass_t mxge_devclass;
135 
136 /* Declare ourselves to be a child of the PCI bus. */
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
146 
147 /* XXX: we don't have Large Receive Offload support yet */
148 inline int
149 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
150 {
151 	(void)ss;
152 	(void)m_head;
153 	(void)csum;
154 	return 1;
155 }
156 
157 inline void
158 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
159 {
160 	(void)ss;
161 	(void)lro;
162 }
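/*
 * Note: returning nonzero from the mxge_lro_rx() stub signals "not
 * consumed by LRO", so the receive path (later in this file, beyond
 * this excerpt) hands every frame up the stack unmerged.
 */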
163 
164 static int
165 mxge_probe(device_t dev)
166 {
167 	int rev;
168 
169 
170 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
171 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
172 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
173 		rev = pci_get_revid(dev);
174 		switch (rev) {
175 		case MXGE_PCI_REV_Z8E:
176 			device_set_desc(dev, "Myri10G-PCIE-8A");
177 			break;
178 		case MXGE_PCI_REV_Z8ES:
179 			device_set_desc(dev, "Myri10G-PCIE-8B");
180 			break;
181 		default:
182 			device_set_desc(dev, "Myri10G-PCIE-8??");
183 			device_printf(dev, "Unrecognized rev %d NIC\n",
184 				      rev);
185 			break;
186 		}
187 		return 0;
188 	}
189 	return ENXIO;
190 }
191 
192 static void
193 mxge_enable_wc(mxge_softc_t *sc)
194 {
195 #if 0
196 #if defined(__i386) || defined(__x86_64)
197 	vm_offset_t len;
198 	int err;
199 
200 	sc->wc = 1;
201 	len = rman_get_size(sc->mem_res);
202 	err = pmap_change_attr((vm_offset_t) sc->sram,
203 			       len, PAT_WRITE_COMBINING);
204 	if (err != 0) {
205 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
206 			      err);
207 		sc->wc = 0;
208 	}
209 #endif
210 #else
211 	sc->wc = 0;	/* TBD: PAT support */
212 #endif
213 }
214 
215 
216 /* callback to get our DMA address */
217 static void
218 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
219 			 int error)
220 {
221 	if (error == 0) {
222 		*(bus_addr_t *) arg = segs->ds_addr;
223 	}
224 }
225 
226 static int
227 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
228 		   bus_size_t alignment)
229 {
230 	int err;
231 	device_t dev = sc->dev;
232 	bus_size_t boundary, maxsegsize;
233 
234 	if (bytes > 4096 && alignment == 4096) {
235 		boundary = 0;
236 		maxsegsize = bytes;
237 	} else {
238 		boundary = 4096;
239 		maxsegsize = 4096;
240 	}
241 
242 	/* allocate DMAable memory tags */
243 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
244 				 alignment,		/* alignment */
245 				 boundary,		/* boundary */
246 				 BUS_SPACE_MAXADDR,	/* low */
247 				 BUS_SPACE_MAXADDR,	/* high */
248 				 NULL, NULL,		/* filter */
249 				 bytes,			/* maxsize */
250 				 1,			/* num segs */
251 				 maxsegsize,		/* maxsegsize */
252 				 BUS_DMA_COHERENT,	/* flags */
253 				 &dma->dmat);		/* tag */
254 	if (err != 0) {
255 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
256 		return err;
257 	}
258 
259 	/* allocate DMAable memory & map */
260 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
261 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
262 				| BUS_DMA_ZERO),  &dma->map);
263 	if (err != 0) {
264 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
265 		goto abort_with_dmat;
266 	}
267 
268 	/* load the memory */
269 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
270 			      mxge_dmamap_callback,
271 			      (void *)&dma->bus_addr, 0);
272 	if (err != 0) {
273 		device_printf(dev, "couldn't load map (err = %d)\n", err);
274 		goto abort_with_mem;
275 	}
276 	return 0;
277 
278 abort_with_mem:
279 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 abort_with_dmat:
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 	return err;
283 }
284 
285 
286 static void
287 mxge_dma_free(mxge_dma_t *dma)
288 {
289 	bus_dmamap_unload(dma->dmat, dma->map);
290 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
291 	(void)bus_dma_tag_destroy(dma->dmat);
292 }
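/*
 * Usage sketch for the pair above (mirrors how the attach path
 * allocates the shared command/response block; the actual call sites
 * are outside this excerpt):
 *
 *	err = mxge_dma_alloc(sc, &sc->cmd_dma,
 *			     sizeof(mcp_cmd_response_t), 64);
 *	if (err != 0)
 *		goto abort;
 *	...
 *	mxge_dma_free(&sc->cmd_dma);
 */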
293 
294 /*
295  * The eeprom strings on the lanaiX have the format
296  * SN=x\0
297  * MAC=x:x:x:x:x:x\0
298  * PC=text\0
299  */
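/*
 * Illustrative sketch (made-up values) of such a string region and
 * how mxge_parse_strings() below consumes it:
 *
 *	"SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=10G-PCIE-8A\0\0"
 *
 * The parser walks one NUL-terminated string at a time; for the MAC=
 * entry it steps to each "xx" hex pair with a stride of 3 (skipping
 * the ':' separators) and converts it via strtoul(ptr, NULL, 16).
 */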
300 
301 static int
302 mxge_parse_strings(mxge_softc_t *sc)
303 {
304 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
305 
306 	char *ptr, *limit;
307 	int i, found_mac;
308 
309 	ptr = sc->eeprom_strings;
310 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 	found_mac = 0;
312 	while (ptr < limit && *ptr != '\0') {
313 		if (memcmp(ptr, "MAC=", 4) == 0) {
314 			ptr += 1;
315 			sc->mac_addr_string = ptr;
316 			for (i = 0; i < 6; i++) {
317 				ptr += 3;
318 				if ((ptr + 2) > limit)
319 					goto abort;
320 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
321 				found_mac = 1;
322 			}
323 		} else if (memcmp(ptr, "PC=", 3) == 0) {
324 			ptr += 3;
325 			strncpy(sc->product_code_string, ptr,
326 				sizeof (sc->product_code_string) - 1);
327 		} else if (memcmp(ptr, "SN=", 3) == 0) {
328 			ptr += 3;
329 			strncpy(sc->serial_number_string, ptr,
330 				sizeof (sc->serial_number_string) - 1);
331 		}
332 		MXGE_NEXT_STRING(ptr);
333 	}
334 
335 	if (found_mac)
336 		return 0;
337 
338  abort:
339 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
340 
341 	return ENXIO;
342 }
343 
344 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 static void
346 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
347 {
348 	uint32_t val;
349 	unsigned long base, off;
350 	char *va, *cfgptr;
351 	device_t pdev, mcp55;
352 	uint16_t vendor_id, device_id, word;
353 	uintptr_t bus, slot, func, ivend, idev;
354 	uint32_t *ptr32;
355 
356 
357 	if (!mxge_nvidia_ecrc_enable)
358 		return;
359 
360 	pdev = device_get_parent(device_get_parent(sc->dev));
361 	if (pdev == NULL) {
362 		device_printf(sc->dev, "could not find parent?\n");
363 		return;
364 	}
365 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
366 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 
368 	if (vendor_id != 0x10de)
369 		return;
370 
371 	base = 0;
372 
373 	if (device_id == 0x005d) {
374 		/* ck804, base address is magic */
375 		base = 0xe0000000UL;
376 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
377 		/* mcp55, base address stored in chipset */
378 		mcp55 = pci_find_bsf(0, 0, 0);
379 		if (mcp55 &&
380 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
381 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
382 			word = pci_read_config(mcp55, 0x90, 2);
383 			base = ((unsigned long)word & 0x7ffeU) << 25;
384 		}
385 	}
386 	if (!base)
387 		return;
388 
389 	/* XXX
390 	   The test below is commented out because it is believed that
391 	   doing a config read/write beyond 0xff will access the config
392 	   space of the next higher function.  Uncomment this and remove
393 	   the hacky pmap_mapdev() way of accessing config space once
394 	   FreeBSD grows support for extended PCIe config space access.
395 	*/
396 #if 0
397 	/* See if we can, by some miracle, access the extended
398 	   config space */
399 	val = pci_read_config(pdev, 0x178, 4);
400 	if (val != 0xffffffff) {
401 		val |= 0x40;
402 		pci_write_config(pdev, 0x178, val, 4);
403 		return;
404 	}
405 #endif
406 	/* Rather than using normal pci config space writes, we must
407 	 * map the Nvidia config space ourselves.  This is because on
408 	 * opteron/nvidia class machines the 0xe0000000 mapping is
409 	 * handled by the nvidia chipset, which means the internal PCI
410 	 * device (the on-chip northbridge), or the amd-8131 bridge,
411 	 * and things behind them are not visible via this method.
412 	 */
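	/*
	 * Worked example of the address computed below (made-up bus
	 * geometry): with base = 0xe0000000, bus = 5, slot = 0 and
	 * func = 0, the window sits at 0xe0000000 + 5 * 0x00100000 =
	 * 0xe0500000.  Each bus gets 1MB, each function 4KB, with 8
	 * functions per slot.
	 */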
413 
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_BUS, &bus);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_SLOT, &slot);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_FUNCTION, &func);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_VENDOR, &ivend);
422 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
423 		      PCI_IVAR_DEVICE, &idev);
424 
425 	off =  base
426 		+ 0x00100000UL * (unsigned long)bus
427 		+ 0x00001000UL * (unsigned long)(func
428 						 + 8 * slot);
429 
430 	/* map it into the kernel */
431 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
432 
433 
434 	if (va == NULL) {
435 		device_printf(sc->dev, "pmap_mapdev() failed\n");
436 		return;
437 	}
438 	/* get a pointer to the config space mapped into the kernel */
439 	cfgptr = va + (off & PAGE_MASK);
440 
441 	/* make sure that we can really access it */
442 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
443 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
444 	if (! (vendor_id == ivend && device_id == idev)) {
445 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
446 			      vendor_id, device_id);
447 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
448 		return;
449 	}
450 
451 	ptr32 = (uint32_t*)(cfgptr + 0x178);
452 	val = *ptr32;
453 
454 	if (val == 0xffffffff) {
455 		device_printf(sc->dev, "extended mapping failed\n");
456 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 		return;
458 	}
459 	*ptr32 = val | 0x40;
460 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 	if (mxge_verbose)
462 		device_printf(sc->dev,
463 			      "Enabled ECRC on upstream Nvidia bridge "
464 			      "at %d:%d:%d\n",
465 			      (int)bus, (int)slot, (int)func);
466 	return;
467 }
468 #else
469 static void
470 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 {
472 	device_printf(sc->dev,
473 		      "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
474 	return;
475 }
476 #endif
477 
478 
479 static int
480 mxge_dma_test(mxge_softc_t *sc, int test_type)
481 {
482 	mxge_cmd_t cmd;
483 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
484 	int status;
485 	uint32_t len;
486 	char *test = " ";
487 
488 
489 	/* Run a small DMA test.
490 	 * The magic multipliers to the length tell the firmware
491 	 * to do DMA read, write, or read+write tests.  The
492 	 * results are returned in cmd.data0.  The upper 16
493 	 * bits of the return is the number of transfers completed.
494 	 * The lower 16 bits is the time in 0.5us ticks that the
495 	 * transfers took to complete.
496 	 */
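	/*
	 * Worked example (illustrative numbers, not a measurement):
	 * with len = 4096 and cmd.data0 = (100 << 16) | 256, the
	 * firmware completed 100 transfers of 4096 bytes in 256 ticks
	 * of 0.5us, so read_dma = (100 * 4096 * 2) / 256 = 3200 MB/s;
	 * the "* 2" converts bytes per half-microsecond into MB/s.
	 */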
497 
498 	len = sc->tx_boundary;
499 
500 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
501 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
502 	cmd.data2 = len * 0x10000;
503 	status = mxge_send_cmd(sc, test_type, &cmd);
504 	if (status != 0) {
505 		test = "read";
506 		goto abort;
507 	}
508 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
509 		(cmd.data0 & 0xffff);
510 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
511 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
512 	cmd.data2 = len * 0x1;
513 	status = mxge_send_cmd(sc, test_type, &cmd);
514 	if (status != 0) {
515 		test = "write";
516 		goto abort;
517 	}
518 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
519 		(cmd.data0 & 0xffff);
520 
521 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
522 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
523 	cmd.data2 = len * 0x10001;
524 	status = mxge_send_cmd(sc, test_type, &cmd);
525 	if (status != 0) {
526 		test = "read/write";
527 		goto abort;
528 	}
529 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
530 		(cmd.data0 & 0xffff);
531 
532 abort:
533 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
534 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
535 			      test, status);
536 
537 	return status;
538 }
539 
540 /*
541  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
542  * when the PCI-E Completion packets are aligned on an 8-byte
543  * boundary.  Some PCI-E chip sets always align Completion packets; on
544  * the ones that do not, the alignment can be enforced by enabling
545  * ECRC generation (if supported).
546  *
547  * When PCI-E Completion packets are not aligned, it is actually more
548  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549  *
550  * If the driver can neither enable ECRC nor verify that it has
551  * already been enabled, then it must use a firmware image which works
552  * around unaligned completion packets (ethp_z8e.dat), and it should
553  * also ensure that it never gives the device a Read-DMA which is
554  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
555  * enabled, then the driver should use the aligned (eth_z8e.dat)
556  * firmware image, and set tx_boundary to 4KB.
557  */
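/*
 * Condensed restatement of the policy implemented below:
 *
 *	completions known aligned  -> mxge_fw_aligned,   tx_boundary 4096
 *	otherwise / can't verify   -> mxge_fw_unaligned, tx_boundary 2048
 *
 * mxge_firmware_probe() loads the aligned image and runs the
 * MXGEFW_CMD_UNALIGNED_TEST DMA test to detect misaligned completions;
 * mxge_select_firmware() falls back to the unaligned image if that
 * probe fails.
 */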
558 
559 static int
560 mxge_firmware_probe(mxge_softc_t *sc)
561 {
562 	device_t dev = sc->dev;
563 	int reg, status;
564 	uint16_t pectl;
565 
566 	sc->tx_boundary = 4096;
567 	/*
568 	 * Verify the max read request size was set to 4KB
569 	 * before trying the test with 4KB.
570 	 */
571 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
572 		pectl = pci_read_config(dev, reg + 0x8, 2);
573 		if ((pectl & (5 << 12)) != (5 << 12)) {
574 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
575 				      pectl);
576 			sc->tx_boundary = 2048;
577 		}
578 	}
579 
580 	/*
581 	 * load the optimized firmware (which assumes aligned PCIe
582 	 * completions) in order to see if it works on this host.
583 	 */
584 	sc->fw_name = mxge_fw_aligned;
585 	status = mxge_load_firmware(sc, 1);
586 	if (status != 0) {
587 		return status;
588 	}
589 
590 	/*
591 	 * Enable ECRC if possible
592 	 */
593 	mxge_enable_nvidia_ecrc(sc);
594 
595 	/*
596 	 * Run a DMA test which watches for unaligned completions and
597 	 * aborts on the first one seen.
598 	 */
599 
600 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 	if (status == 0)
602 		return 0; /* keep the aligned firmware */
603 
604 	if (status != E2BIG)
605 		device_printf(dev, "DMA test failed: %d\n", status);
606 	if (status == ENOSYS)
607 		device_printf(dev, "Falling back to ethp! "
608 			      "Please install up-to-date firmware\n");
609 	return status;
610 }
611 
612 static int
613 mxge_select_firmware(mxge_softc_t *sc)
614 {
615 	int aligned = 0;
616 
617 
618 	if (mxge_force_firmware != 0) {
619 		if (mxge_force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 union qualhack
655 {
656         const char *ro_char;
657         char *rw_char;
658 };
659 
660 static int
661 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
662 {
663 
664 
665 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
666 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
667 			      be32toh(hdr->mcp_type));
668 		return EIO;
669 	}
670 
671 	/* save firmware version for sysctl */
672 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 	if (mxge_verbose)
674 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 
676 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
677 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 
679 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
680 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
681 		device_printf(sc->dev, "Found firmware version %s\n",
682 			      sc->fw_version);
683 		device_printf(sc->dev, "Driver needs %d.%d\n",
684 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
685 		return EINVAL;
686 	}
687 	return 0;
688 
689 }
690 
691 static void *
692 z_alloc(void *nil, u_int items, u_int size)
693 {
694         void *ptr;
695 
696         ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
697         return ptr;
698 }
699 
700 static void
701 z_free(void *nil, void *ptr)
702 {
703         kfree(ptr, M_TEMP);
704 }
705 
706 
707 static int
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
709 {
710 	z_stream zs;
711 	char *inflate_buffer;
712 	const struct firmware *fw;
713 	const mcp_gen_header_t *hdr;
714 	unsigned hdr_offset;
715 	int status;
716 	unsigned int i;
717 	char dummy;
718 	size_t fw_len;
719 
720 	fw = firmware_get(sc->fw_name);
721 	if (fw == NULL) {
722 		device_printf(sc->dev, "Could not find firmware image %s\n",
723 			      sc->fw_name);
724 		return ENOENT;
725 	}
726 
727 
728 
729 	/* setup zlib and decompress f/w */
730 	bzero(&zs, sizeof (zs));
731 	zs.zalloc = z_alloc;
732 	zs.zfree = z_free;
733 	status = inflateInit(&zs);
734 	if (status != Z_OK) {
735 		status = EIO;
736 		goto abort_with_fw;
737 	}
738 
739 	/* the uncompressed size is stored as the firmware version,
740 	   which would otherwise go unused */
741 	fw_len = (size_t) fw->version;
742 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
743 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* don't report success on alloc failure */
744 		goto abort_with_zs;
	}
745 	zs.avail_in = fw->datasize;
746 	zs.next_in = __DECONST(char *, fw->data);
747 	zs.avail_out = fw_len;
748 	zs.next_out = inflate_buffer;
749 	status = inflate(&zs, Z_FINISH);
750 	if (status != Z_STREAM_END) {
751 		device_printf(sc->dev, "zlib %d\n", status);
752 		status = EIO;
753 		goto abort_with_buffer;
754 	}
755 
756 	/* check id */
757 	hdr_offset = htobe32(*(const uint32_t *)
758 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
759 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
760 		device_printf(sc->dev, "Bad firmware file\n");
761 		status = EIO;
762 		goto abort_with_buffer;
763 	}
764 	hdr = (const void*)(inflate_buffer + hdr_offset);
765 
766 	status = mxge_validate_firmware(sc, hdr);
767 	if (status != 0)
768 		goto abort_with_buffer;
769 
770 	/* Copy the inflated firmware to NIC SRAM. */
771 	for (i = 0; i < fw_len; i += 256) {
772 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
773 			      inflate_buffer + i,
774 			      min(256U, (unsigned)(fw_len - i)));
775 		wmb();
776 		dummy = *sc->sram;
777 		wmb();
778 	}
779 
780 	*limit = fw_len;
781 	status = 0;
782 abort_with_buffer:
783 	kfree(inflate_buffer, M_TEMP);
784 abort_with_zs:
785 	inflateEnd(&zs);
786 abort_with_fw:
787 	firmware_put(fw, FIRMWARE_UNLOAD);
788 	return status;
789 }
790 
791 /*
792  * Enable or disable periodic RDMAs from the host to make certain
793  * chipsets resend dropped PCIe messages
794  */
795 
796 static void
797 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
798 {
799 	char buf_bytes[72];
800 	volatile uint32_t *confirm;
801 	volatile char *submit;
802 	uint32_t *buf, dma_low, dma_high;
803 	int i;
804 
805 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 
807 	/* clear confirmation addr */
808 	confirm = (volatile uint32_t *)sc->cmd;
809 	*confirm = 0;
810 	wmb();
811 
812 	/* send an rdma command to the PCIe engine, and wait for the
813 	   response in the confirmation address.  The firmware should
814 	   write a -1 there to indicate it is alive and well
815 	*/
816 
817 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
818 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
819 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
820 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
821 	buf[2] = htobe32(0xffffffff);		/* confirm data */
822 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
823 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
824 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
825 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
826 	buf[5] = htobe32(enable);			/* enable? */
827 
828 
829 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 
831 	mxge_pio_copy(submit, buf, 64);
832 	wmb();
833 	DELAY(1000);
834 	wmb();
835 	i = 0;
836 	while (*confirm != 0xffffffff && i < 20) {
837 		DELAY(1000);
838 		i++;
839 	}
840 	if (*confirm != 0xffffffff) {
841 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
842 			      (enable ? "enable" : "disable"), confirm,
843 			      *confirm);
844 	}
845 	return;
846 }
847 
848 static int
849 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
850 {
851 	mcp_cmd_t *buf;
852 	char buf_bytes[sizeof(*buf) + 8];
853 	volatile mcp_cmd_response_t *response = sc->cmd;
854 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
855 	uint32_t dma_low, dma_high;
856 	int err, sleep_total = 0;
857 
858 	/*
859 	 * We may be called during attach, before if_serializer is available.
860 	 * This is not a fast path, just check for NULL
861 	 */
862 
863 	if (sc->ifp->if_serializer)
864 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
865 
866 	/* ensure buf is aligned to 8 bytes */
867 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
868 
869 	buf->data0 = htobe32(data->data0);
870 	buf->data1 = htobe32(data->data1);
871 	buf->data2 = htobe32(data->data2);
872 	buf->cmd = htobe32(cmd);
873 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
874 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
875 
876 	buf->response_addr.low = htobe32(dma_low);
877 	buf->response_addr.high = htobe32(dma_high);
878 
879 
880 	response->result = 0xffffffff;
881 	wmb();
882 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
883 
884 	/* wait up to 20ms */
885 	err = EAGAIN;
886 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
887 		bus_dmamap_sync(sc->cmd_dma.dmat,
888 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
889 		wmb();
890 		switch (be32toh(response->result)) {
891 		case 0:
892 			data->data0 = be32toh(response->data);
893 			err = 0;
894 			break;
895 		case 0xffffffff:
896 			DELAY(1000);
897 			break;
898 		case MXGEFW_CMD_UNKNOWN:
899 			err = ENOSYS;
900 			break;
901 		case MXGEFW_CMD_ERROR_UNALIGNED:
902 			err = E2BIG;
903 			break;
904 		case MXGEFW_CMD_ERROR_BUSY:
905 			err = EBUSY;
906 			break;
907 		default:
908 			device_printf(sc->dev,
909 				      "mxge: command %d "
910 				      "failed, result = %d\n",
911 				      cmd, be32toh(response->result));
912 			err = ENXIO;
913 			break;
914 		}
915 		if (err != EAGAIN)
916 			break;
917 	}
918 	if (err == EAGAIN)
919 		device_printf(sc->dev, "mxge: command %d timed out, "
920 			      "result = %d\n",
921 			      cmd, be32toh(response->result));
922 	return err;
923 }
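/*
 * Typical calling pattern (a sketch mirroring the callers later in
 * this file): fill in cmd.data0..data2 as inputs, issue the command,
 * then read any result back from cmd.data0.
 *
 *	mxge_cmd_t cmd;
 *	int err;
 *
 *	cmd.data0 = 0;
 *	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
 *	if (err == 0)
 *		irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
 */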
924 
925 static int
926 mxge_adopt_running_firmware(mxge_softc_t *sc)
927 {
928 	struct mcp_gen_header *hdr;
929 	const size_t bytes = sizeof (struct mcp_gen_header);
930 	size_t hdr_offset;
931 	int status;
932 
933 	/* find running firmware header */
934 	hdr_offset = htobe32(*(volatile uint32_t *)
935 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
936 
937 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
938 		device_printf(sc->dev,
939 			      "Running firmware has bad header offset (%d)\n",
940 			      (int)hdr_offset);
941 		return EIO;
942 	}
943 
944 	/* copy header of running firmware from SRAM to host memory to
945 	 * validate firmware */
946 	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
947 	if (hdr == NULL) {
948 		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
949 		return ENOMEM;
950 	}
951 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
952 				rman_get_bushandle(sc->mem_res),
953 				hdr_offset, (char *)hdr, bytes);
954 	status = mxge_validate_firmware(sc, hdr);
955 	kfree(hdr, M_DEVBUF);
956 
957 	/*
958 	 * check to see if adopted firmware has bug where adopting
959 	 * it will cause broadcasts to be filtered unless the NIC
960 	 * is kept in ALLMULTI mode
961 	 */
962 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
963 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
964 		sc->adopted_rx_filter_bug = 1;
965 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
966 			      "working around rx filter bug\n",
967 			      sc->fw_ver_major, sc->fw_ver_minor,
968 			      sc->fw_ver_tiny);
969 	}
970 
971 	return status;
972 }
973 
974 
975 static int
976 mxge_load_firmware(mxge_softc_t *sc, int adopt)
977 {
978 	volatile uint32_t *confirm;
979 	volatile char *submit;
980 	char buf_bytes[72];
981 	uint32_t *buf, size, dma_low, dma_high;
982 	int status, i;
983 
984 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
985 
986 	size = sc->sram_size;
987 	status = mxge_load_firmware_helper(sc, &size);
988 	if (status) {
989 		if (!adopt)
990 			return status;
991 		/* Try to use the currently running firmware, if
992 		   it is new enough */
993 		status = mxge_adopt_running_firmware(sc);
994 		if (status) {
995 			device_printf(sc->dev,
996 				      "failed to adopt running firmware\n");
997 			return status;
998 		}
999 		device_printf(sc->dev,
1000 			      "Successfully adopted running firmware\n");
1001 		if (sc->tx_boundary == 4096) {
1002 			device_printf(sc->dev,
1003 				"Using firmware currently running on NIC"
1004 				 ".  For optimal\n");
1005 			device_printf(sc->dev,
1006 				 "performance consider loading optimized "
1007 				 "firmware\n");
1008 		}
1009 		sc->fw_name = mxge_fw_unaligned;
1010 		sc->tx_boundary = 2048;
1011 		return 0;
1012 	}
1013 	/* clear confirmation addr */
1014 	confirm = (volatile uint32_t *)sc->cmd;
1015 	*confirm = 0;
1016 	wmb();
1017 	/* send a reload command to the bootstrap MCP, and wait for the
1018 	   response in the confirmation address.  The firmware should
1019 	   write a -1 there to indicate it is alive and well
1020 	*/
1021 
1022 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1023 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1024 
1025 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1026 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1027 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1028 
1029 	/* FIX: All newest firmware should un-protect the bottom of
1030 	   the sram before handoff. However, the very first interfaces
1031 	   do not. Therefore the handoff copy must skip the first 8 bytes
1032 	*/
1033 					/* where the code starts*/
1034 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1035 	buf[4] = htobe32(size - 8); 	/* length of code */
1036 	buf[5] = htobe32(8);		/* where to copy to */
1037 	buf[6] = htobe32(0);		/* where to jump to */
1038 
1039 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1040 	mxge_pio_copy(submit, buf, 64);
1041 	wmb();
1042 	DELAY(1000);
1043 	wmb();
1044 	i = 0;
1045 	while (*confirm != 0xffffffff && i < 20) {
1046 		DELAY(1000*10);
1047 		i++;
1048 		bus_dmamap_sync(sc->cmd_dma.dmat,
1049 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1050 	}
1051 	if (*confirm != 0xffffffff) {
1052 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1053 			confirm, *confirm);
1054 
1055 		return ENXIO;
1056 	}
1057 	return 0;
1058 }
1059 
1060 static int
1061 mxge_update_mac_address(mxge_softc_t *sc)
1062 {
1063 	mxge_cmd_t cmd;
1064 	uint8_t *addr = sc->mac_addr;
1065 	int status;
1066 
1067 
1068 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1069 		     | (addr[2] << 8) | addr[3]);
1070 
1071 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1072 
1073 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1074 	return status;
1075 }
1076 
1077 static int
1078 mxge_change_pause(mxge_softc_t *sc, int pause)
1079 {
1080 	mxge_cmd_t cmd;
1081 	int status;
1082 
1083 	if (pause)
1084 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1085 				       &cmd);
1086 	else
1087 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1088 				       &cmd);
1089 
1090 	if (status) {
1091 		device_printf(sc->dev, "Failed to set flow control mode\n");
1092 		return ENXIO;
1093 	}
1094 	sc->pause = pause;
1095 	return 0;
1096 }
1097 
1098 static void
1099 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1100 {
1101 	mxge_cmd_t cmd;
1102 	int status;
1103 
1104 	if (sc->ifp->if_serializer)
1105 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
1106 	if (mxge_always_promisc)
1107 		promisc = 1;
1108 
1109 	if (promisc)
1110 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1111 				       &cmd);
1112 	else
1113 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1114 				       &cmd);
1115 
1116 	if (status) {
1117 		device_printf(sc->dev, "Failed to set promisc mode\n");
1118 	}
1119 }
1120 
1121 static void
1122 mxge_set_multicast_list(mxge_softc_t *sc)
1123 {
1124 	mxge_cmd_t cmd;
1125 	struct ifmultiaddr *ifma;
1126 	struct ifnet *ifp = sc->ifp;
1127 	int err;
1128 
1129 	if (ifp->if_serializer)
1130 		ASSERT_SERIALIZED(ifp->if_serializer);
1131 
1132 	/* This firmware is known to not support multicast */
1133 	if (!sc->fw_multicast_support)
1134 		return;
1135 
1136 	/* Disable multicast filtering while we play with the lists */
1137 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1140 		       " error status: %d\n", err);
1141 		return;
1142 	}
1143 
1144 	if (sc->adopted_rx_filter_bug)
1145 		return;
1146 
1147 	if (ifp->if_flags & IFF_ALLMULTI)
1148 		/* request to disable multicast filtering, so quit here */
1149 		return;
1150 
1151 	/* Flush all the filters */
1152 
1153 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1154 	if (err != 0) {
1155 		device_printf(sc->dev,
1156 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1157 			      ", error status: %d\n", err);
1158 		return;
1159 	}
1160 
1161 	/* Walk the multicast list, and add each address */
1162 
1163 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1164 		if (ifma->ifma_addr->sa_family != AF_LINK)
1165 			continue;
1166 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1167 		      &cmd.data0, 4);
1168 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1169 		      &cmd.data1, 2);
1170 		cmd.data0 = htonl(cmd.data0);
1171 		cmd.data1 = htonl(cmd.data1);
1172 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1173 		if (err != 0) {
1174 			device_printf(sc->dev, "Failed "
1175 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1176 			       " %d\n", err);
1177 			/* abort, leaving multicast filtering off */
1178 			return;
1179 		}
1180 	}
1181 	/* Enable multicast filtering */
1182 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1183 	if (err != 0) {
1184 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1185 		       ", error status: %d\n", err);
1186 	}
1187 }
1188 
1189 static int
1190 mxge_max_mtu(mxge_softc_t *sc)
1191 {
1192 	mxge_cmd_t cmd;
1193 	int status;
1194 
1195 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1196 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1197 
1198 	/* try to set nbufs to see if we can
1199 	   use virtually contiguous jumbos */
1200 	cmd.data0 = 0;
1201 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1202 			       &cmd);
1203 	if (status == 0)
1204 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1205 
1206 	/* otherwise, we're limited to MJUMPAGESIZE */
1207 	return MJUMPAGESIZE - MXGEFW_PAD;
1208 }
1209 
1210 static int
1211 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1212 {
1213 	struct mxge_slice_state *ss;
1214 	mxge_rx_done_t *rx_done;
1215 	volatile uint32_t *irq_claim;
1216 	mxge_cmd_t cmd;
1217 	int slice, status;
1218 
1219 	/* try to send a reset command to the card to see if it
1220 	   is alive */
1221 	memset(&cmd, 0, sizeof (cmd));
1222 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1223 	if (status != 0) {
1224 		device_printf(sc->dev, "failed reset\n");
1225 		return ENXIO;
1226 	}
1227 
1228 	mxge_dummy_rdma(sc, 1);
1229 
1230 
1231 	/* set the intrq size */
1232 	cmd.data0 = sc->rx_ring_size;
1233 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1234 
1235 	/*
1236 	 * Even though we already know how many slices are supported
1237 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1238 	 * has magic side effects, and must be called after a reset.
1239 	 * It must be called prior to calling any RSS related cmds,
1240 	 * including assigning an interrupt queue for anything but
1241 	 * slice 0.  It must also be called *after*
1242 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1243 	 * the firmware to compute offsets.
1244 	 */
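	/*
	 * Condensed ordering, restating the comment above as it is
	 * realized in this function:
	 *
	 *	1. MXGEFW_CMD_RESET
	 *	2. MXGEFW_CMD_SET_INTRQ_SIZE
	 *	3. MXGEFW_CMD_GET_MAX_RSS_QUEUES  (num_slices > 1 only)
	 *	4. MXGEFW_CMD_ENABLE_RSS_QUEUES   (num_slices > 1 only)
	 *	5. MXGEFW_CMD_SET_INTRQ_DMA       (one per slice)
	 */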
1245 
1246 	if (sc->num_slices > 1) {
1247 		/* ask the maximum number of slices it supports */
1248 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1249 					   &cmd);
1250 		if (status != 0) {
1251 			device_printf(sc->dev,
1252 				      "failed to get number of slices\n");
1253 			return status;
1254 		}
1255 		/*
1256 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1257 		 * to setting up the interrupt queue DMA
1258 		 */
1259 		cmd.data0 = sc->num_slices;
1260 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1261 #ifdef IFNET_BUF_RING
1262 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1263 #endif
1264 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1265 					   &cmd);
1266 		if (status != 0) {
1267 			device_printf(sc->dev,
1268 				      "failed to set number of slices\n");
1269 			return status;
1270 		}
1271 	}
1272 
1273 
1274 	if (interrupts_setup) {
1275 		/* Now exchange information about interrupts  */
1276 		for (slice = 0; slice < sc->num_slices; slice++) {
1277 			rx_done = &sc->ss[slice].rx_done;
1278 			memset(rx_done->entry, 0, sc->rx_ring_size);
1279 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1280 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1281 			cmd.data2 = slice;
1282 			status |= mxge_send_cmd(sc,
1283 						MXGEFW_CMD_SET_INTRQ_DMA,
1284 						&cmd);
1285 		}
1286 	}
1287 
1288 	status |= mxge_send_cmd(sc,
1289 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1290 
1291 
1292 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1293 
1294 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1295 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1296 
1297 
1298 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1299 				&cmd);
1300 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1301 	if (status != 0) {
1302 		device_printf(sc->dev, "failed set interrupt parameters\n");
1303 		return status;
1304 	}
1305 
1306 
1307 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1308 
1309 
1310 	/* run a DMA benchmark */
1311 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1312 
1313 	for (slice = 0; slice < sc->num_slices; slice++) {
1314 		ss = &sc->ss[slice];
1315 
1316 		ss->irq_claim = irq_claim + (2 * slice);
1317 		/* reset mcp/driver shared state back to 0 */
1318 		ss->rx_done.idx = 0;
1319 		ss->rx_done.cnt = 0;
1320 		ss->tx.req = 0;
1321 		ss->tx.done = 0;
1322 		ss->tx.pkt_done = 0;
1323 		ss->tx.queue_active = 0;
1324 		ss->tx.activate = 0;
1325 		ss->tx.deactivate = 0;
1326 		ss->tx.wake = 0;
1327 		ss->tx.defrag = 0;
1328 		ss->tx.stall = 0;
1329 		ss->rx_big.cnt = 0;
1330 		ss->rx_small.cnt = 0;
1331 		ss->lro_bad_csum = 0;
1332 		ss->lro_queued = 0;
1333 		ss->lro_flushed = 0;
1334 		if (ss->fw_stats != NULL) {
1335 			ss->fw_stats->valid = 0;
1336 			ss->fw_stats->send_done_count = 0;
1337 		}
1338 	}
1339 	sc->rdma_tags_available = 15;
1340 	status = mxge_update_mac_address(sc);
1341 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1342 	mxge_change_pause(sc, sc->pause);
1343 	mxge_set_multicast_list(sc);
1344 	return status;
1345 }
1346 
1347 static int
1348 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1349 {
1350         mxge_softc_t *sc;
1351         unsigned int intr_coal_delay;
1352         int err;
1353 
1354         sc = arg1;
1355         intr_coal_delay = sc->intr_coal_delay;
1356         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1357         if (err != 0) {
1358                 return err;
1359         }
1360         if (intr_coal_delay == sc->intr_coal_delay)
1361                 return 0;
1362 
1363         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1364                 return EINVAL;
1365 
1366 	lwkt_serialize_enter(sc->ifp->if_serializer);
1367 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1368 	sc->intr_coal_delay = intr_coal_delay;
1369 
1370 	lwkt_serialize_exit(sc->ifp->if_serializer);
1371         return err;
1372 }
1373 
1374 static int
1375 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1376 {
1377         mxge_softc_t *sc;
1378         unsigned int enabled;
1379         int err;
1380 
1381         sc = arg1;
1382         enabled = sc->pause;
1383         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1384         if (err != 0) {
1385                 return err;
1386         }
1387         if (enabled == sc->pause)
1388                 return 0;
1389 
1390 	lwkt_serialize_enter(sc->ifp->if_serializer);
1391 	err = mxge_change_pause(sc, enabled);
1392 	lwkt_serialize_exit(sc->ifp->if_serializer);
1393         return err;
1394 }
1395 
1396 static int
1397 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1398 {
1399 	struct ifnet *ifp;
1400 	int err = 0;
1401 
1402 	ifp = sc->ifp;
1403 	if (lro_cnt == 0)
1404 		ifp->if_capenable &= ~IFCAP_LRO;
1405 	else
1406 		ifp->if_capenable |= IFCAP_LRO;
1407 	sc->lro_cnt = lro_cnt;
1408 	if (ifp->if_flags & IFF_RUNNING) {
1409 		mxge_close(sc);
1410 		err = mxge_open(sc);
1411 	}
1412 	return err;
1413 }
1414 
1415 static int
1416 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1417 {
1418 	mxge_softc_t *sc;
1419 	unsigned int lro_cnt;
1420 	int err;
1421 
1422 	sc = arg1;
1423 	lro_cnt = sc->lro_cnt;
1424 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1425 	if (err != 0)
1426 		return err;
1427 
1428 	if (lro_cnt == sc->lro_cnt)
1429 		return 0;
1430 
1431 	if (lro_cnt > 128)
1432 		return EINVAL;
1433 
1434 	lwkt_serialize_enter(sc->ifp->if_serializer);
1435 	err = mxge_change_lro_locked(sc, lro_cnt);
1436 	lwkt_serialize_exit(sc->ifp->if_serializer);
1437 	return err;
1438 }
1439 
1440 static int
1441 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1442 {
1443         int err;
1444 
1445         if (arg1 == NULL)
1446                 return EFAULT;
1447         arg2 = be32toh(*(int *)arg1);
1448         arg1 = NULL;
1449         err = sysctl_handle_int(oidp, arg1, arg2, req);
1450 
1451         return err;
1452 }
1453 
1454 static void
1455 mxge_rem_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct mxge_slice_state *ss;
1458 	int slice;
1459 
1460 	if (sc->slice_sysctl_tree == NULL)
1461 		return;
1462 
1463 	for (slice = 0; slice < sc->num_slices; slice++) {
1464 		ss = &sc->ss[slice];
1465 		if (ss == NULL || ss->sysctl_tree == NULL)
1466 			continue;
1467 		sysctl_ctx_free(&ss->sysctl_ctx);
1468 		ss->sysctl_tree = NULL;
1469 	}
1470 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1471 	sc->slice_sysctl_tree = NULL;
1472 	sysctl_ctx_free(&sc->sysctl_ctx);
1473 	sc->sysctl_tree = NULL;
1474 
1475 }
1476 
1477 static void
1478 mxge_add_sysctls(mxge_softc_t *sc)
1479 {
1480 	struct sysctl_ctx_list *ctx;
1481 	struct sysctl_oid_list *children;
1482 	mcp_irq_data_t *fw;
1483 	struct mxge_slice_state *ss;
1484 	int slice;
1485 	char slice_num[8];
1486 
1487 	ctx = &sc->sysctl_ctx;
1488 	sysctl_ctx_init(ctx);
1489 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1490 					  OID_AUTO,
1491 					  device_get_nameunit(sc->dev),
1492 					  CTLFLAG_RD, 0, "");
1493 	if (sc->sysctl_tree == NULL) {
1494 		device_printf(sc->dev, "can't add sysctl node\n");
1495 		return;
1496 	}
1497 
1498 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1499 	fw = sc->ss[0].fw_stats;
1500 
1501 	/* random information */
1502 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1503 		       "firmware_version",
1504 		       CTLFLAG_RD, &sc->fw_version,
1505 		       0, "firmware version");
1506 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1507 		       "serial_number",
1508 		       CTLFLAG_RD, &sc->serial_number_string,
1509 		       0, "serial number");
1510 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1511 		       "product_code",
1512 		       CTLFLAG_RD, &sc->product_code_string,
1513 		       0, "product_code");
1514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 		       "pcie_link_width",
1516 		       CTLFLAG_RD, &sc->link_width,
1517 		       0, "PCIe link width");
1518 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1519 		       "tx_boundary",
1520 		       CTLFLAG_RD, &sc->tx_boundary,
1521 		       0, "tx_boundary");
1522 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1523 		       "write_combine",
1524 		       CTLFLAG_RD, &sc->wc,
1525 		       0, "write combining PIO?");
1526 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 		       "read_dma_MBs",
1528 		       CTLFLAG_RD, &sc->read_dma,
1529 		       0, "DMA Read speed in MB/s");
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "write_dma_MBs",
1532 		       CTLFLAG_RD, &sc->write_dma,
1533 		       0, "DMA Write speed in MB/s");
1534 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1535 		       "read_write_dma_MBs",
1536 		       CTLFLAG_RD, &sc->read_write_dma,
1537 		       0, "DMA concurrent Read/Write speed in MB/s");
1538 
1539 
1540 	/* performance related tunables */
1541 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 			"intr_coal_delay",
1543 			CTLTYPE_INT|CTLFLAG_RW, sc,
1544 			0, mxge_change_intr_coal,
1545 			"I", "interrupt coalescing delay in usecs");
1546 
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"flow_control_enabled",
1549 			CTLTYPE_INT|CTLFLAG_RW, sc,
1550 			0, mxge_change_flow_control,
1551 			"I", "enable flow control (pause frames)");
1552 
1553 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1554 		       "deassert_wait",
1555 		       CTLFLAG_RW, &mxge_deassert_wait,
1556 		       0, "Wait for IRQ line to go low in ihandler");
1557 
1558 	/* stats block from firmware is in network byte order.
1559 	   Need to swap it */
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 			"link_up",
1562 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1563 			0, mxge_handle_be32,
1564 			"I", "link up");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"rdma_tags_available",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1568 			0, mxge_handle_be32,
1569 			"I", "rdma_tags_available");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_bad_crc32",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_bad_crc32,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_bad_crc32");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_bad_phy",
1578 			CTLTYPE_INT|CTLFLAG_RD,
1579 			&fw->dropped_bad_phy,
1580 			0, mxge_handle_be32,
1581 			"I", "dropped_bad_phy");
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_link_error_or_filtered",
1584 			CTLTYPE_INT|CTLFLAG_RD,
1585 			&fw->dropped_link_error_or_filtered,
1586 			0, mxge_handle_be32,
1587 			"I", "dropped_link_error_or_filtered");
1588 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 			"dropped_link_overflow",
1590 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1591 			0, mxge_handle_be32,
1592 			"I", "dropped_link_overflow");
1593 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 			"dropped_multicast_filtered",
1595 			CTLTYPE_INT|CTLFLAG_RD,
1596 			&fw->dropped_multicast_filtered,
1597 			0, mxge_handle_be32,
1598 			"I", "dropped_multicast_filtered");
1599 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600 			"dropped_no_big_buffer",
1601 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1602 			0, mxge_handle_be32,
1603 			"I", "dropped_no_big_buffer");
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_no_small_buffer",
1606 			CTLTYPE_INT|CTLFLAG_RD,
1607 			&fw->dropped_no_small_buffer,
1608 			0, mxge_handle_be32,
1609 			"I", "dropped_no_small_buffer");
1610 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1611 			"dropped_overrun",
1612 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1613 			0, mxge_handle_be32,
1614 			"I", "dropped_overrun");
1615 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1616 			"dropped_pause",
1617 			CTLTYPE_INT|CTLFLAG_RD,
1618 			&fw->dropped_pause,
1619 			0, mxge_handle_be32,
1620 			"I", "dropped_pause");
1621 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1622 			"dropped_runt",
1623 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1624 			0, mxge_handle_be32,
1625 			"I", "dropped_runt");
1626 
1627 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1628 			"dropped_unicast_filtered",
1629 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1630 			0, mxge_handle_be32,
1631 			"I", "dropped_unicast_filtered");
1632 
1633 	/* verbose printing? */
1634 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 		       "verbose",
1636 		       CTLFLAG_RW, &mxge_verbose,
1637 		       0, "verbose printing");
1638 
1639 	/* lro */
1640 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641 			"lro_cnt",
1642 			CTLTYPE_INT|CTLFLAG_RW, sc,
1643 			0, mxge_change_lro,
1644 			"I", "number of lro merge queues");
1645 
1646 
1647 	/* add counters exported for debugging from all slices */
1648 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1649 	sc->slice_sysctl_tree =
1650 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1651 				"slice", CTLFLAG_RD, 0, "");
1652 
1653 	for (slice = 0; slice < sc->num_slices; slice++) {
1654 		ss = &sc->ss[slice];
1655 		sysctl_ctx_init(&ss->sysctl_ctx);
1656 		ctx = &ss->sysctl_ctx;
1657 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1658 		ksprintf(slice_num, "%d", slice);
1659 		ss->sysctl_tree =
1660 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1661 					CTLFLAG_RD, 0, "");
1662 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "rx_small_cnt",
1665 			       CTLFLAG_RD, &ss->rx_small.cnt,
1666 			       0, "rx_small_cnt");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "rx_big_cnt",
1669 			       CTLFLAG_RD, &ss->rx_big.cnt,
1670 			       0, "rx_big_cnt");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1673 			       0, "number of lro merge queues flushed");
1674 
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1677 			       0, "number of frames appended to lro merge "
1678 			       "queues");
1679 
1680 #ifndef IFNET_BUF_RING
1681 		/* only transmit from slice 0 for now */
1682 		if (slice > 0)
1683 			continue;
1684 #endif
1685 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686 			       "tx_req",
1687 			       CTLFLAG_RD, &ss->tx.req,
1688 			       0, "tx_req");
1689 
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "tx_done",
1692 			       CTLFLAG_RD, &ss->tx.done,
1693 			       0, "tx_done");
1694 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1695 			       "tx_pkt_done",
1696 			       CTLFLAG_RD, &ss->tx.pkt_done,
1697 			       0, "tx_pkt_done");
1698 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699 			       "tx_stall",
1700 			       CTLFLAG_RD, &ss->tx.stall,
1701 			       0, "tx_stall");
1702 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1703 			       "tx_wake",
1704 			       CTLFLAG_RD, &ss->tx.wake,
1705 			       0, "tx_wake");
1706 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1707 			       "tx_defrag",
1708 			       CTLFLAG_RD, &ss->tx.defrag,
1709 			       0, "tx_defrag");
1710 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1711 			       "tx_queue_active",
1712 			       CTLFLAG_RD, &ss->tx.queue_active,
1713 			       0, "tx_queue_active");
1714 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1715 			       "tx_activate",
1716 			       CTLFLAG_RD, &ss->tx.activate,
1717 			       0, "tx_activate");
1718 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1719 			       "tx_deactivate",
1720 			       CTLFLAG_RD, &ss->tx.deactivate,
1721 			       0, "tx_deactivate");
1722 	}
1723 }
1724 
1725 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1726    backwards one at a time and handle ring wraps */
1727 
1728 static inline void
1729 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1730 			    mcp_kreq_ether_send_t *src, int cnt)
1731 {
1732         int idx, starting_slot;
1733         starting_slot = tx->req;
1734         while (cnt > 1) {
1735                 cnt--;
1736                 idx = (starting_slot + cnt) & tx->mask;
1737                 mxge_pio_copy(&tx->lanai[idx],
1738 			      &src[cnt], sizeof(*src));
1739                 wmb();
1740         }
1741 }
1742 
1743 /*
1744  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1745  * at most 32 bytes at a time, so as to avoid involving the software
1746  * pio handler in the nic.   We re-write the first segment's flags
1747  * to mark them valid only after writing the entire chain
1748  */
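/*
 * Why 32 bytes: each mcp_kreq_ether_send_t is 16 bytes (assuming the
 * layout in mxge_mcp.h), so the loop below copies two requests per
 * wmb()-delimited burst, matching one write-combining flush.  Holding
 * back src->flags keeps the NIC from parsing the first descriptor
 * early; the final 4-byte store of the saved flags is what publishes
 * the whole chain.
 */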
1749 
1750 static inline void
1751 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1752                   int cnt)
1753 {
1754         int idx, i;
1755         uint32_t *src_ints;
1756 	volatile uint32_t *dst_ints;
1757         mcp_kreq_ether_send_t *srcp;
1758 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1759 	uint8_t last_flags;
1760 
1761         idx = tx->req & tx->mask;
1762 
1763 	last_flags = src->flags;
1764 	src->flags = 0;
1765         wmb();
1766         dst = dstp = &tx->lanai[idx];
1767         srcp = src;
1768 
1769         if ((idx + cnt) < tx->mask) {
1770                 for (i = 0; i < (cnt - 1); i += 2) {
1771                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1772                         wmb(); /* force write every 32 bytes */
1773                         srcp += 2;
1774                         dstp += 2;
1775                 }
1776         } else {
1777                 /* submit all but the first request, and ensure
1778                    that it is submitted below */
1779                 mxge_submit_req_backwards(tx, src, cnt);
1780                 i = 0;
1781         }
1782         if (i < cnt) {
1783                 /* submit the first request */
1784                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1785                 wmb(); /* barrier before setting valid flag */
1786         }
1787 
1788         /* re-write the last 32-bits with the valid flags */
1789         src->flags = last_flags;
1790         src_ints = (uint32_t *)src;
1791         src_ints+=3;
1792         dst_ints = (volatile uint32_t *)dst;
1793         dst_ints+=3;
1794         *dst_ints =  *src_ints;
1795         tx->req += cnt;
1796         wmb();
1797 }
1798 
1799 #if IFCAP_TSO4
1800 
1801 static void
1802 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1803 	       int busdma_seg_cnt, int ip_off)
1804 {
1805 	mxge_tx_ring_t *tx;
1806 	mcp_kreq_ether_send_t *req;
1807 	bus_dma_segment_t *seg;
1808 	struct ip *ip;
1809 	struct tcphdr *tcp;
1810 	uint32_t low, high_swapped;
1811 	int len, seglen, cum_len, cum_len_next;
1812 	int next_is_first, chop, cnt, rdma_count, small;
1813 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1814 	uint8_t flags, flags_next;
1815 	static int once;
1816 
1817 	mss = m->m_pkthdr.tso_segsz;
1818 
1819 	/* negative cum_len signifies to the
1820 	 * send loop that we are still in the
1821 	 * header portion of the TSO packet.
1822 	 */
1823 
1824 	/* ensure we have the ethernet, IP and TCP
1825 	   header together in the first mbuf, copy
1826 	   it to a scratch buffer if not */
1827 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1828 		m_copydata(m, 0, ip_off + sizeof (*ip),
1829 			   ss->scratch);
1830 		ip = (struct ip *)(ss->scratch + ip_off);
1831 	} else {
1832 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1833 	}
1834 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1835 			    + sizeof (*tcp))) {
1836 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1837 			   + sizeof (*tcp),  ss->scratch);
1838 		ip = (struct ip *)(ss->scratch + ip_off);	/* headers now in scratch */
1839 	}
1840 
1841 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1842 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1843 
1844 	/* TSO implies checksum offload on this hardware */
1845 	cksum_offset = ip_off + (ip->ip_hl << 2);
1846 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1847 
1848 
1849 	/* for TSO, pseudo_hdr_offset holds mss.
1850 	 * The firmware figures out where to put
1851 	 * the checksum by parsing the header. */
1852 	pseudo_hdr_offset = htobe16(mss);
1853 
1854 	tx = &ss->tx;
1855 	req = tx->req_list;
1856 	seg = tx->seg_list;
1857 	cnt = 0;
1858 	rdma_count = 0;
1859 	/* "rdma_count" is the number of RDMAs belonging to the
1860 	 * current packet BEFORE the current send request. For
1861 	 * non-TSO packets, this is equal to "count".
1862 	 * For TSO packets, rdma_count needs to be reset
1863 	 * to 0 after a segment cut.
1864 	 *
1865 	 * The rdma_count field of the send request is
1866 	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1868 	 * in the middle, this is the number of RDMAs starting
1869 	 * after the last cut in the request. All previous
1870 	 * segments before the last cut implicitly have 1 RDMA.
1871 	 *
1872 	 * Since the number of RDMAs is not known beforehand,
1873 	 * it must be filled-in retroactively - after each
1874 	 * segmentation cut or at the end of the entire packet.
1875 	 */
1876 
1877 	while (busdma_seg_cnt) {
1878 		/* Break the busdma segment up into pieces*/
1879 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1881 		len = seg->ds_len;
1882 
1883 		while (len) {
1884 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1885 			seglen = len;
1886 			cum_len_next = cum_len + seglen;
1887 			(req-rdma_count)->rdma_count = rdma_count + 1;
1888 			if (__predict_true(cum_len >= 0)) {
1889 				/* payload */
1890 				chop = (cum_len_next > mss);
1891 				cum_len_next = cum_len_next % mss;
1892 				next_is_first = (cum_len_next == 0);
1893 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1894 				flags_next |= next_is_first *
1895 					MXGEFW_FLAGS_FIRST;
1896 				rdma_count |= -(chop | next_is_first);
1897 				rdma_count += chop & !next_is_first;
1898 			} else if (cum_len_next >= 0) {
1899 				/* header ends */
1900 				rdma_count = -1;
1901 				cum_len_next = 0;
1902 				seglen = -cum_len;
1903 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1904 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1905 					MXGEFW_FLAGS_FIRST |
1906 					(small * MXGEFW_FLAGS_SMALL);
			}
1908 
1909 			req->addr_high = high_swapped;
1910 			req->addr_low = htobe32(low);
1911 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 			req->pad = 0;
1913 			req->rdma_count = 1;
1914 			req->length = htobe16(seglen);
1915 			req->cksum_offset = cksum_offset;
1916 			req->flags = flags | ((cum_len & 1) *
1917 					      MXGEFW_FLAGS_ALIGN_ODD);
1918 			low += seglen;
1919 			len -= seglen;
1920 			cum_len = cum_len_next;
1921 			flags = flags_next;
1922 			req++;
1923 			cnt++;
1924 			rdma_count++;
1925 			if (__predict_false(cksum_offset > seglen))
1926 				cksum_offset -= seglen;
1927 			else
1928 				cksum_offset = 0;
1929 			if (__predict_false(cnt > tx->max_desc))
1930 				goto drop;
1931 		}
1932 		busdma_seg_cnt--;
1933 		seg++;
1934 	}
1935 	(req-rdma_count)->rdma_count = rdma_count;
1936 
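	/*
	 * Walk backwards from the end of the chain, tagging the
	 * descriptors of the final TSO segment with TSO_LAST; stop
	 * once the descriptor that starts that segment (flagged
	 * CHOP or FIRST) has been marked.
	 */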
1937 	do {
1938 		req--;
1939 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1940 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1941 
1942 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1943 	mxge_submit_req(tx, tx->req_list, cnt);
1944 #ifdef IFNET_BUF_RING
1945 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1946 		/* tell the NIC to start polling this slice */
1947 		*tx->send_go = 1;
1948 		tx->queue_active = 1;
1949 		tx->activate++;
1950 		wmb();
1951 	}
1952 #endif
1953 	return;
1954 
1955 drop:
1956 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1957 	m_freem(m);
1958 	ss->oerrors++;
1959 	if (!once) {
1960 		kprintf("tx->max_desc exceeded via TSO!\n");
		kprintf("mss = %d, seg_list offset = %ld, max_desc = %d\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
1963 		once = 1;
1964 	}
1965 	return;
1966 
1967 }
1968 
1969 #endif /* IFCAP_TSO4 */
1970 
1971 #ifdef MXGE_NEW_VLAN_API
1972 /*
1973  * We reproduce the software vlan tag insertion from
1974  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1975  * vlan tag insertion. We need to advertise this in order to have the
1976  * vlan interface respect our csum offload flags.
1977  */
1978 static struct mbuf *
1979 mxge_vlan_tag_insert(struct mbuf *m)
1980 {
1981 	struct ether_vlan_header *evl;
1982 
1983 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1984 	if (__predict_false(m == NULL))
1985 		return NULL;
1986 	if (m->m_len < sizeof(*evl)) {
1987 		m = m_pullup(m, sizeof(*evl));
1988 		if (__predict_false(m == NULL))
1989 			return NULL;
1990 	}
1991 	/*
1992 	 * Transform the Ethernet header into an Ethernet header
1993 	 * with 802.1Q encapsulation.
1994 	 */
1995 	evl = mtod(m, struct ether_vlan_header *);
1996 	bcopy((char *)evl + EVL_ENCAPLEN,
1997 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1998 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1999 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2000 	m->m_flags &= ~M_VLANTAG;
2001 	return m;
2002 }
2003 #endif /* MXGE_NEW_VLAN_API */
2004 
2005 static void
2006 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2007 {
2008 	mxge_softc_t *sc;
2009 	mcp_kreq_ether_send_t *req;
2010 	bus_dma_segment_t *seg;
2011 	struct mbuf *m_tmp;
2012 	struct ifnet *ifp;
2013 	mxge_tx_ring_t *tx;
2014 	struct ip *ip;
2015 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2016 	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;

2020 	sc = ss->sc;
2021 	ifp = sc->ifp;
2022 	tx = &ss->tx;
2023 
2024 	ip_off = sizeof (struct ether_header);
2025 #ifdef MXGE_NEW_VLAN_API
2026 	if (m->m_flags & M_VLANTAG) {
2027 		m = mxge_vlan_tag_insert(m);
2028 		if (__predict_false(m == NULL))
2029 			goto drop;
2030 		ip_off += EVL_ENCAPLEN;
2031 	}
2032 #endif
2033 	/* (try to) map the frame for DMA */
2034 	idx = tx->req & tx->mask;
2035 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2036 					   m, tx->seg_list, 1, &cnt,
2037 					   BUS_DMA_NOWAIT);
2038 	if (__predict_false(err == EFBIG)) {
2039 		/* Too many segments in the chain.  Try
2040 		   to defrag */
2041 		m_tmp = m_defrag(m, MB_DONTWAIT);
2042 		if (m_tmp == NULL) {
2043 			goto drop;
2044 		}
2045 		ss->tx.defrag++;
2046 		m = m_tmp;
2047 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2048 					      tx->info[idx].map,
2049 					      m, tx->seg_list, 1, &cnt,
2050 					      BUS_DMA_NOWAIT);
2051 	}
2052 	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d,"
			      " packet len = %d\n", err, m->m_pkthdr.len);
2055 		goto drop;
2056 	}
2057 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2058 			BUS_DMASYNC_PREWRITE);
2059 	tx->info[idx].m = m;
2060 
2061 #if IFCAP_TSO4
2062 	/* TSO is different enough, we handle it in another routine */
2063 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2064 		mxge_encap_tso(ss, m, cnt, ip_off);
2065 		return;
2066 	}
2067 #endif
2068 
2069 	req = tx->req_list;
2070 	cksum_offset = 0;
2071 	pseudo_hdr_offset = 0;
2072 	flags = MXGEFW_FLAGS_NO_TSO;
2073 
2074 	/* checksum offloading? */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2076 		/* ensure ip header is in first mbuf, copy
2077 		   it to a scratch buffer if not */
2078 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2079 			m_copydata(m, 0, ip_off + sizeof (*ip),
2080 				   ss->scratch);
2081 			ip = (struct ip *)(ss->scratch + ip_off);
2082 		} else {
2083 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2084 		}
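		/* cksum_offset tells the NIC where the region to sum
		 * begins (the start of the L4 header); csum_data is
		 * the offset of the checksum field within that header,
		 * so pseudo_hdr_offset is where the NIC will store
		 * the computed checksum */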
2085 		cksum_offset = ip_off + (ip->ip_hl << 2);
2086 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2087 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2088 		req->cksum_offset = cksum_offset;
2089 		flags |= MXGEFW_FLAGS_CKSUM;
2090 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2091 	} else {
2092 		odd_flag = 0;
2093 	}
2094 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2095 		flags |= MXGEFW_FLAGS_SMALL;
2096 
2097 	/* convert segments into a request list */
2098 	cum_len = 0;
2099 	seg = tx->seg_list;
2100 	req->flags = MXGEFW_FLAGS_FIRST;
2101 	for (i = 0; i < cnt; i++) {
2102 		req->addr_low =
2103 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2104 		req->addr_high =
2105 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2106 		req->length = htobe16(seg->ds_len);
2107 		req->cksum_offset = cksum_offset;
2108 		if (cksum_offset > seg->ds_len)
2109 			cksum_offset -= seg->ds_len;
2110 		else
2111 			cksum_offset = 0;
2112 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2113 		req->pad = 0; /* complete solid 16-byte block */
2114 		req->rdma_count = 1;
2115 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2116 		cum_len += seg->ds_len;
2117 		seg++;
2118 		req++;
2119 		req->flags = 0;
2120 	}
2121 	req--;
2122 	/* pad runts to 60 bytes */
2123 	if (cum_len < 60) {
2124 		req++;
2125 		req->addr_low =
2126 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 		req->addr_high =
2128 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 		req->length = htobe16(60 - cum_len);
2130 		req->cksum_offset = 0;
2131 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2132 		req->pad = 0; /* complete solid 16-byte block */
2133 		req->rdma_count = 1;
2134 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2135 		cnt++;
2136 	}
2137 
2138 	tx->req_list[0].rdma_count = cnt;
2139 #if 0
2140 	/* print what the firmware will see */
2141 	for (i = 0; i < cnt; i++) {
2142 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2143 		    "cso:%d, flags:0x%x, rdma:%d\n",
2144 		    i, (int)ntohl(tx->req_list[i].addr_high),
2145 		    (int)ntohl(tx->req_list[i].addr_low),
2146 		    (int)ntohs(tx->req_list[i].length),
2147 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2148 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2149 		    tx->req_list[i].rdma_count);
2150 	}
2151 	kprintf("--------------\n");
2152 #endif
2153 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2154 	mxge_submit_req(tx, tx->req_list, cnt);
2155 #ifdef IFNET_BUF_RING
2156 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2157 		/* tell the NIC to start polling this slice */
2158 		*tx->send_go = 1;
2159 		tx->queue_active = 1;
2160 		tx->activate++;
2161 		wmb();
2162 	}
2163 #endif
2164 	return;
2165 
2166 drop:
2167 	m_freem(m);
2168 	ss->oerrors++;
2169 	return;
2170 }
2171 
2172 #ifdef IFNET_BUF_RING
2173 static void
2174 mxge_qflush(struct ifnet *ifp)
2175 {
2176 	mxge_softc_t *sc = ifp->if_softc;
2177 	mxge_tx_ring_t *tx;
2178 	struct mbuf *m;
2179 	int slice;
2180 
2181 	for (slice = 0; slice < sc->num_slices; slice++) {
2182 		tx = &sc->ss[slice].tx;
2183 		lwkt_serialize_enter(sc->ifp->if_serializer);
2184 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2185 			m_freem(m);
2186 		lwkt_serialize_exit(sc->ifp->if_serializer);
2187 	}
2188 	if_qflush(ifp);
2189 }
2190 
2191 static inline void
2192 mxge_start_locked(struct mxge_slice_state *ss)
2193 {
2194 	mxge_softc_t *sc;
2195 	struct mbuf *m;
2196 	struct ifnet *ifp;
2197 	mxge_tx_ring_t *tx;
2198 
2199 	sc = ss->sc;
2200 	ifp = sc->ifp;
2201 	tx = &ss->tx;
2202 
2203 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2204 		m = drbr_dequeue(ifp, tx->br);
2205 		if (m == NULL) {
2206 			return;
2207 		}
2208 		/* let BPF see it */
2209 		BPF_MTAP(ifp, m);
2210 
2211 		/* give it to the nic */
2212 		mxge_encap(ss, m);
2213 	}
2214 	/* ran out of transmit slots */
2215 	if (((ss->if_flags & IFF_OACTIVE) == 0)
2216 	    && (!drbr_empty(ifp, tx->br))) {
2217 		ss->if_flags |= IFF_OACTIVE;
2218 		tx->stall++;
2219 	}
2220 }
2221 
2222 static int
2223 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2224 {
2225 	mxge_softc_t *sc;
2226 	struct ifnet *ifp;
2227 	mxge_tx_ring_t *tx;
2228 	int err;
2229 
2230 	sc = ss->sc;
2231 	ifp = sc->ifp;
2232 	tx = &ss->tx;
2233 
2234 	if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2235 	    IFF_RUNNING) {
2236 		err = drbr_enqueue(ifp, tx->br, m);
2237 		return (err);
2238 	}
2239 
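	/* transmit directly when nothing is queued and descriptor
	 * space is available; otherwise enqueue and let
	 * mxge_start_locked() drain the ring */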
2240 	if (drbr_empty(ifp, tx->br) &&
2241 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2242 		/* let BPF see it */
2243 		BPF_MTAP(ifp, m);
2244 		/* give it to the nic */
2245 		mxge_encap(ss, m);
2246 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2247 		return (err);
2248 	}
2249 	if (!drbr_empty(ifp, tx->br))
2250 		mxge_start_locked(ss);
2251 	return (0);
2252 }
2253 
2254 static int
2255 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2256 {
2257 	mxge_softc_t *sc = ifp->if_softc;
2258 	struct mxge_slice_state *ss;
2259 	mxge_tx_ring_t *tx;
	int err = 0;
	int slice = 0;	/* flowid is unavailable, so default to slice 0 */

#if 0
	slice = m->m_pkthdr.flowid;
#endif
	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2267 
2268 	ss = &sc->ss[slice];
2269 	tx = &ss->tx;
2270 
	if (lwkt_serialize_try(ifp->if_serializer)) {
2272 		err = mxge_transmit_locked(ss, m);
2273 		lwkt_serialize_exit(ifp->if_serializer);
2274 	} else {
2275 		err = drbr_enqueue(ifp, tx->br, m);
2276 	}
2277 
2278 	return (err);
2279 }
2280 
2281 #else
2282 
2283 static inline void
2284 mxge_start_locked(struct mxge_slice_state *ss)
2285 {
2286 	mxge_softc_t *sc;
2287 	struct mbuf *m;
2288 	struct ifnet *ifp;
2289 	mxge_tx_ring_t *tx;
2290 
2291 	sc = ss->sc;
2292 	ifp = sc->ifp;
2293 	tx = &ss->tx;
2294 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2295 		m = ifq_dequeue(&ifp->if_snd, NULL);
2296 		if (m == NULL) {
2297 			return;
2298 		}
2299 		/* let BPF see it */
2300 		BPF_MTAP(ifp, m);
2301 
2302 		/* give it to the nic */
2303 		mxge_encap(ss, m);
2304 	}
2305 	/* ran out of transmit slots */
2306 	if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2307 		sc->ifp->if_flags |= IFF_OACTIVE;
2308 		tx->stall++;
2309 	}
2310 }
2311 #endif
2312 static void
2313 mxge_start(struct ifnet *ifp)
2314 {
2315 	mxge_softc_t *sc = ifp->if_softc;
2316 	struct mxge_slice_state *ss;
2317 
2318 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2319 	/* only use the first slice for now */
2320 	ss = &sc->ss[0];
2321 	mxge_start_locked(ss);
2322 }
2323 
2324 /*
2325  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2326  * at most 32 bytes at a time, so as to avoid involving the software
2327  * pio handler in the nic.   We re-write the first segment's low
2328  * DMA address to mark it valid only after we write the entire chunk
2329  * in a burst
2330  */
2331 static inline void
2332 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2333 		mcp_kreq_ether_recv_t *src)
2334 {
2335 	uint32_t low;
2336 
2337 	low = src->addr_low;
2338 	src->addr_low = 0xffffffff;
2339 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 	wmb();
2341 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2342 	wmb();
2343 	src->addr_low = low;
2344 	dst->addr_low = low;
2345 	wmb();
2346 }
2347 
2348 static int
2349 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 {
2351 	bus_dma_segment_t seg;
2352 	struct mbuf *m;
2353 	mxge_rx_ring_t *rx = &ss->rx_small;
2354 	int cnt, err;
2355 
2356 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2357 	if (m == NULL) {
2358 		rx->alloc_fail++;
2359 		err = ENOBUFS;
2360 		goto done;
2361 	}
2362 	m->m_len = m->m_pkthdr.len = MHLEN;
2363 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2364 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2365 	if (err != 0) {
2366 		kprintf("can't dmamap small (%d)\n", err);
2367 		m_free(m);
2368 		goto done;
2369 	}
2370 	rx->info[idx].m = m;
2371 	rx->shadow[idx].addr_low =
2372 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 	rx->shadow[idx].addr_high =
2374 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2375 
2376 done:
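	/* receive buffers are posted to the NIC in bursts of 8;
	 * submit a group once its last slot has been (re)filled */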
2377 	if ((idx & 7) == 7)
2378 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2379 	return err;
2380 }
2381 
2382 
2383 static int
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2385 {
2386 	bus_dma_segment_t seg[3];
2387 	struct mbuf *m;
2388 	mxge_rx_ring_t *rx = &ss->rx_big;
2389 	int cnt, err, i;
2390 
2391 	if (rx->cl_size == MCLBYTES)
2392 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 	else {
2394 #if 0
2395 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 #else
2397 		/*
2398 		 * XXX: allocate normal sized buffers for big buffers.
2399 		 * We should be fine as long as we don't get any jumbo frames
2400 		 */
2401 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2402 #endif
2403 	}
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
	m->m_len = m->m_pkthdr.len = rx->mlen;
2411 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2412 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2413 	if (err != 0) {
2414 		kprintf("can't dmamap big (%d)\n", err);
2415 		m_free(m);
2416 		goto done;
2417 	}
2418 	rx->info[idx].m = m;
2419 	rx->shadow[idx].addr_low =
2420 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2421 	rx->shadow[idx].addr_high =
2422 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2423 
2424 #if MXGE_VIRT_JUMBOS
2425 	for (i = 1; i < cnt; i++) {
2426 		rx->shadow[idx + i].addr_low =
2427 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2428 		rx->shadow[idx + i].addr_high =
2429 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
	}
2431 #endif
2432 
2433 done:
	for (i = 0; i < rx->nbufs; i++) {
2435 		if ((idx & 7) == 7) {
2436 			mxge_submit_8rx(&rx->lanai[idx - 7],
2437 					&rx->shadow[idx - 7]);
2438 		}
2439 		idx++;
2440 	}
2441 	return err;
2442 }
2443 
2444 /*
2445  *  Myri10GE hardware checksums are not valid if the sender
2446  *  padded the frame with non-zero padding.  This is because
2447  *  the firmware just does a simple 16-bit 1s complement
2448  *  checksum across the entire frame, excluding the first 14
 *  bytes.  It is best to simply check the checksum and
2450  *  tell the stack about it only if the checksum is good
2451  */
2452 
2453 static inline uint16_t
2454 mxge_rx_csum(struct mbuf *m, int csum)
2455 {
2456 	struct ether_header *eh;
2457 	struct ip *ip;
2458 	uint16_t c;
2459 
2460 	eh = mtod(m, struct ether_header *);
2461 
2462 	/* only deal with IPv4 TCP & UDP for now */
2463 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2464 		return 1;
2465 	ip = (struct ip *)(eh + 1);
2466 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2467 			    ip->ip_p != IPPROTO_UDP))
2468 		return 1;
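	/*
	 * Fold the IPv4 pseudo-header into the firmware's flat
	 * 1s-complement sum.  A valid IP header sums to ~0, so what
	 * remains is the L4 checksum plus the pseudo-header; a zero
	 * result after the final complement means the checksum
	 * verified.
	 */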
2469 #ifdef INET
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
			    (ip->ip_hl << 2) + ip->ip_p));
2473 #else
2474 	c = 1;
2475 #endif
2476 	c ^= 0xffff;
2477 	return (c);
2478 }
2479 
2480 static void
2481 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2482 {
2483 	struct ether_vlan_header *evl;
2484 	struct ether_header *eh;
2485 	uint32_t partial;
2486 
2487 	evl = mtod(m, struct ether_vlan_header *);
2488 	eh = mtod(m, struct ether_header *);
2489 
2490 	/*
2491 	 * fix checksum by subtracting EVL_ENCAPLEN bytes
2492 	 * after what the firmware thought was the end of the ethernet
2493 	 * header.
2494 	 */
2495 
2496 	/* put checksum into host byte order */
2497 	*csum = ntohs(*csum);
2498 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
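	/* 1s-complement subtract the 4 tag bytes: add the complement,
	 * propagate the end-around carry, then fold back to 16 bits */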
	(*csum) += ~partial;
	(*csum) += ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2503 
2504 	/* restore checksum to network byte order;
2505 	   later consumers expect this */
2506 	*csum = htons(*csum);
2507 
2508 	/* save the tag */
2509 #ifdef MXGE_NEW_VLAN_API
2510 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2511 #else
2512 	{
2513 		struct m_tag *mtag;
2514 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2515 				   MB_DONTWAIT);
2516 		if (mtag == NULL)
2517 			return;
2518 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2519 		m_tag_prepend(m, mtag);
2520 	}
2521 
2522 #endif
2523 	m->m_flags |= M_VLANTAG;
2524 
2525 	/*
2526 	 * Remove the 802.1q header by copying the Ethernet
2527 	 * addresses over it and adjusting the beginning of
2528 	 * the data in the mbuf.  The encapsulated Ethernet
2529 	 * type field is already in place.
2530 	 */
2531 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2532 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2533 	m_adj(m, EVL_ENCAPLEN);
2534 }
2535 
2536 
2537 static inline void
2538 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2539 		   struct mbuf_chain *chain)
2540 {
2541 	mxge_softc_t *sc;
2542 	struct ifnet *ifp;
2543 	struct mbuf *m;
2544 	struct ether_header *eh;
2545 	mxge_rx_ring_t *rx;
2546 	bus_dmamap_t old_map;
2547 	int idx;
2548 	uint16_t tcpudp_csum;
2549 
2550 	sc = ss->sc;
2551 	ifp = sc->ifp;
2552 	rx = &ss->rx_big;
2553 	idx = rx->cnt & rx->mask;
2554 	rx->cnt += rx->nbufs;
2555 	/* save a pointer to the received mbuf */
2556 	m = rx->info[idx].m;
2557 	/* try to replace the received mbuf */
2558 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2559 		/* drop the frame -- the old mbuf is re-cycled */
2560 		ifp->if_ierrors++;
2561 		return;
2562 	}
2563 
2564 	/* unmap the received buffer */
2565 	old_map = rx->info[idx].map;
2566 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2567 	bus_dmamap_unload(rx->dmat, old_map);
2568 
2569 	/* swap the bus_dmamap_t's */
2570 	rx->info[idx].map = rx->extra_map;
2571 	rx->extra_map = old_map;
2572 
2573 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2574 	 * aligned */
2575 	m->m_data += MXGEFW_PAD;
2576 
2577 	m->m_pkthdr.rcvif = ifp;
2578 	m->m_len = m->m_pkthdr.len = len;
2579 	ss->ipackets++;
2580 	eh = mtod(m, struct ether_header *);
2581 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2582 		mxge_vlan_tag_remove(m, &csum);
2583 	}
2584 	/* if the checksum is valid, mark it in the mbuf header */
2585 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2586 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2587 			return;
2588 		/* otherwise, it was a UDP frame, or a TCP frame which
2589 		   we could not do LRO on.  Tell the stack that the
2590 		   checksum is good */
2591 		m->m_pkthdr.csum_data = 0xffff;
2592 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2593 	}
2594 #if 0
2595 	/* flowid only valid if RSS hashing is enabled */
2596 	if (sc->num_slices > 1) {
2597 		m->m_pkthdr.flowid = (ss - sc->ss);
2598 		m->m_flags |= M_FLOWID;
2599 	}
2600 #endif
2601 	ether_input_chain(ifp, m, NULL, chain);
2602 }
2603 
2604 static inline void
2605 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2606 		   struct mbuf_chain *chain)
2607 {
2608 	mxge_softc_t *sc;
2609 	struct ifnet *ifp;
2610 	struct ether_header *eh;
2611 	struct mbuf *m;
2612 	mxge_rx_ring_t *rx;
2613 	bus_dmamap_t old_map;
2614 	int idx;
2615 	uint16_t tcpudp_csum;
2616 
2617 	sc = ss->sc;
2618 	ifp = sc->ifp;
2619 	rx = &ss->rx_small;
2620 	idx = rx->cnt & rx->mask;
2621 	rx->cnt++;
2622 	/* save a pointer to the received mbuf */
2623 	m = rx->info[idx].m;
2624 	/* try to replace the received mbuf */
2625 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2626 		/* drop the frame -- the old mbuf is re-cycled */
2627 		ifp->if_ierrors++;
2628 		return;
2629 	}
2630 
2631 	/* unmap the received buffer */
2632 	old_map = rx->info[idx].map;
2633 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2634 	bus_dmamap_unload(rx->dmat, old_map);
2635 
2636 	/* swap the bus_dmamap_t's */
2637 	rx->info[idx].map = rx->extra_map;
2638 	rx->extra_map = old_map;
2639 
2640 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2641 	 * aligned */
2642 	m->m_data += MXGEFW_PAD;
2643 
2644 	m->m_pkthdr.rcvif = ifp;
2645 	m->m_len = m->m_pkthdr.len = len;
2646 	ss->ipackets++;
2647 	eh = mtod(m, struct ether_header *);
2648 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649 		mxge_vlan_tag_remove(m, &csum);
2650 	}
2651 	/* if the checksum is valid, mark it in the mbuf header */
2652 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2653 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2654 			return;
2655 		/* otherwise, it was a UDP frame, or a TCP frame which
2656 		   we could not do LRO on.  Tell the stack that the
2657 		   checksum is good */
2658 		m->m_pkthdr.csum_data = 0xffff;
2659 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2660 	}
2661 #if 0
2662 	/* flowid only valid if RSS hashing is enabled */
2663 	if (sc->num_slices > 1) {
2664 		m->m_pkthdr.flowid = (ss - sc->ss);
2665 		m->m_flags |= M_FLOWID;
2666 	}
2667 #endif
2668 	ether_input_chain(ifp, m, NULL, chain);
2669 }
2670 
2671 /*
2672  * XXX
2673  *
2674  * Inlining the call to this function causes mxge_intr() to grow too large
2675  * for GCC's stack size limits (which shouldn't take into account inlining
2676  * of leaf functions at one call site anyway). Inlining is definitely a
2677  * good idea in this case though, so mark the function appropriately.
2678  */
2679 static __always_inline void
2680 mxge_clean_rx_done(struct mxge_slice_state *ss)
2681 {
2682 	mxge_rx_done_t *rx_done = &ss->rx_done;
2683 	int limit = 0;
2684 	uint16_t length;
2685 	uint16_t checksum;
2686 	struct mbuf_chain chain[MAXCPU];
2687 
2688 	ether_input_chain_init(chain);
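	/* drain the completion ring: an entry with a non-zero length
	 * has been filled in by the firmware and is ready to process */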
2689 	while (rx_done->entry[rx_done->idx].length != 0) {
2690 		length = ntohs(rx_done->entry[rx_done->idx].length);
2691 		rx_done->entry[rx_done->idx].length = 0;
2692 		checksum = rx_done->entry[rx_done->idx].checksum;
2693 		if (length <= (MHLEN - MXGEFW_PAD))
2694 			mxge_rx_done_small(ss, length, checksum, chain);
2695 		else
2696 			mxge_rx_done_big(ss, length, checksum, chain);
2697 		rx_done->cnt++;
2698 		rx_done->idx = rx_done->cnt & rx_done->mask;
2699 
2700 		/* limit potential for livelock */
2701 		if (__predict_false(++limit > rx_done->mask / 2))
2702 			break;
2703 	}
2704 	ether_input_dispatch(chain);
2705 #ifdef INET
2706 	while (!SLIST_EMPTY(&ss->lro_active)) {
2707 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2708 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2709 		mxge_lro_flush(ss, lro);
2710 	}
2711 #endif
2712 }
2713 
2714 
2715 static inline void
2716 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2717 {
2718 	struct ifnet *ifp;
2719 	mxge_tx_ring_t *tx;
2720 	struct mbuf *m;
2721 	bus_dmamap_t map;
2722 	int idx;
2723 	int *flags;
2724 
2725 	tx = &ss->tx;
2726 	ifp = ss->sc->ifp;
2727 	ASSERT_SERIALIZED(ifp->if_serializer);
2728 	while (tx->pkt_done != mcp_idx) {
2729 		idx = tx->done & tx->mask;
2730 		tx->done++;
2731 		m = tx->info[idx].m;
2732 		/* mbuf and DMA map only attached to the first
2733 		   segment per-mbuf */
2734 		if (m != NULL) {
2735 			ss->obytes += m->m_pkthdr.len;
2736 			if (m->m_flags & M_MCAST)
2737 				ss->omcasts++;
2738 			ss->opackets++;
2739 			tx->info[idx].m = NULL;
2740 			map = tx->info[idx].map;
2741 			bus_dmamap_unload(tx->dmat, map);
2742 			m_freem(m);
2743 		}
2744 		if (tx->info[idx].flag) {
2745 			tx->info[idx].flag = 0;
2746 			tx->pkt_done++;
2747 		}
2748 	}
2749 
	/* If we have space, clear IFF_OACTIVE to tell the stack that
	   it's OK to send packets */
2752 #ifdef IFNET_BUF_RING
2753 	flags = &ss->if_flags;
2754 #else
2755 	flags = &ifp->if_flags;
2756 #endif
2757 	if ((*flags) & IFF_OACTIVE &&
2758 	    tx->req - tx->done < (tx->mask + 1)/4) {
2759 		*(flags) &= ~IFF_OACTIVE;
2760 		ss->tx.wake++;
2761 		mxge_start_locked(ss);
2762 	}
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/* let the NIC stop polling this queue, since there
		 * are no more transmits pending */
		*tx->send_stop = 1;
		tx->queue_active = 0;
		tx->deactivate++;
		wmb();
	}
#endif
2775 
2776 }
2777 
2778 static struct mxge_media_type mxge_xfp_media_types[] =
2779 {
2780 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2781 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2782 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2783 	{0,		(1 << 5),	"10GBASE-ER"},
2784 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2785 	{0,		(1 << 3),	"10GBASE-SW"},
2786 	{0,		(1 << 2),	"10GBASE-LW"},
2787 	{0,		(1 << 1),	"10GBASE-EW"},
2788 	{0,		(1 << 0),	"Reserved"}
2789 };
2790 static struct mxge_media_type mxge_sfp_media_types[] =
2791 {
2792 	{0,		(1 << 7),	"Reserved"},
2793 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2794 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2795 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2796 };
2797 
2798 static void
2799 mxge_set_media(mxge_softc_t *sc, int type)
2800 {
2801 	sc->media_flags |= type;
2802 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2803 	ifmedia_set(&sc->media, sc->media_flags);
2804 }
2805 
2806 
2807 /*
2808  * Determine the media type for a NIC.  Some XFPs will identify
2809  * themselves only when their link is up, so this is initiated via a
2810  * link up interrupt.  However, this can potentially take up to
2811  * several milliseconds, so it is run via the watchdog routine, rather
2812  * than in the interrupt handler itself.   This need only be done
2813  * once, not each time the link is up.
2814  */
2815 static void
2816 mxge_media_probe(mxge_softc_t *sc)
2817 {
2818 	mxge_cmd_t cmd;
2819 	char *cage_type;
2820 	char *ptr;
2821 	struct mxge_media_type *mxge_media_types = NULL;
2822 	int i, err, ms, mxge_media_type_entries;
2823 	uint32_t byte;
2824 
2825 	sc->need_media_probe = 0;
2826 
2827 	/* if we've already set a media type, we're done */
2828 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2829 		return;
2830 
2831 	/*
	 * parse the product code to determine the interface type
2833 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2834 	 * after the 3rd dash in the driver's cached copy of the
2835 	 * EEPROM's product code string.
2836 	 */
	ptr = sc->product_code_string;
	if (ptr == NULL) {
		device_printf(sc->dev, "Missing product code\n");
		return;
	}
2841 
2842 	for (i = 0; i < 3; i++, ptr++) {
2843 		ptr = index(ptr, '-');
2844 		if (ptr == NULL) {
2845 			device_printf(sc->dev,
2846 				      "only %d dashes in PC?!?\n", i);
2847 			return;
2848 		}
2849 	}
2850 	if (*ptr == 'C') {
2851 		/* -C is CX4 */
2852 		mxge_set_media(sc, IFM_10G_CX4);
2853 		return;
	} else if (*ptr == 'Q') {
2856 		/* -Q is Quad Ribbon Fiber */
2857 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2858 		/* FreeBSD has no media type for Quad ribbon fiber */
2859 		return;
2860 	}
2861 
2862 	if (*ptr == 'R') {
2863 		/* -R is XFP */
2864 		mxge_media_types = mxge_xfp_media_types;
2865 		mxge_media_type_entries =
2866 			sizeof (mxge_xfp_media_types) /
2867 			sizeof (mxge_xfp_media_types[0]);
2868 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2869 		cage_type = "XFP";
2870 	}
2871 
	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2873 		/* -S or -2S is SFP+ */
2874 		mxge_media_types = mxge_sfp_media_types;
2875 		mxge_media_type_entries =
2876 			sizeof (mxge_sfp_media_types) /
2877 			sizeof (mxge_sfp_media_types[0]);
2878 		cage_type = "SFP+";
2879 		byte = 3;
2880 	}
2881 
2882 	if (mxge_media_types == NULL) {
2883 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2884 		return;
2885 	}
2886 
	/*
	 * At this point we know the NIC has a module cage, so now we
	 * try to determine what is in the cage by using the
	 * firmware's I2C commands to read the module's 10GbE
	 * compliance register.  We read just one byte, which may
	 * take over a millisecond.
	 */
2894 
2895 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2896 	cmd.data1 = byte;
2897 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2898 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2899 		device_printf(sc->dev, "failed to read XFP\n");
2900 	}
2901 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2902 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2903 	}
2904 	if (err != MXGEFW_CMD_OK) {
2905 		return;
2906 	}
2907 
2908 	/* now we wait for the data to be cached */
2909 	cmd.data0 = byte;
2910 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2911 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2912 		DELAY(1000);
2913 		cmd.data0 = byte;
2914 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2915 	}
2916 	if (err != MXGEFW_CMD_OK) {
2917 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2918 			      cage_type, err, ms);
2919 		return;
2920 	}
2921 
2922 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2923 		if (mxge_verbose)
2924 			device_printf(sc->dev, "%s:%s\n", cage_type,
2925 				      mxge_media_types[0].name);
2926 		mxge_set_media(sc, IFM_10G_CX4);
2927 		return;
2928 	}
2929 	for (i = 1; i < mxge_media_type_entries; i++) {
2930 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2931 			if (mxge_verbose)
2932 				device_printf(sc->dev, "%s:%s\n",
2933 					      cage_type,
2934 					      mxge_media_types[i].name);
2935 
2936 			mxge_set_media(sc, mxge_media_types[i].flag);
2937 			return;
2938 		}
2939 	}
2940 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2941 		      cmd.data0);
2942 
2943 	return;
2944 }
2945 
2946 static void
2947 mxge_intr(void *arg)
2948 {
2949 	struct mxge_slice_state *ss = arg;
2950 	mxge_softc_t *sc = ss->sc;
2951 	mcp_irq_data_t *stats = ss->fw_stats;
2952 	mxge_tx_ring_t *tx = &ss->tx;
2953 	mxge_rx_done_t *rx_done = &ss->rx_done;
2954 	uint32_t send_done_count;
2955 	uint8_t valid;
2956 
2957 
2958 #ifndef IFNET_BUF_RING
2959 	/* an interrupt on a non-zero slice is implicitly valid
2960 	   since MSI-X irqs are not shared */
2961 	if (ss != sc->ss) {
2962 		mxge_clean_rx_done(ss);
2963 		*ss->irq_claim = be32toh(3);
2964 		return;
2965 	}
2966 #endif
2967 
2968 	/* make sure the DMA has finished */
2969 	if (!stats->valid) {
2970 		return;
2971 	}
2972 	valid = stats->valid;
2973 
2974 	if (sc->legacy_irq) {
2975 		/* lower legacy IRQ  */
2976 		*sc->irq_deassert = 0;
2977 		if (!mxge_deassert_wait)
2978 			/* don't wait for conf. that irq is low */
2979 			stats->valid = 0;
2980 	} else {
2981 		stats->valid = 0;
2982 	}
2983 
2984 	/* loop while waiting for legacy irq deassertion */
2985 	do {
2986 		/* check for transmit completes and receives */
2987 		send_done_count = be32toh(stats->send_done_count);
2988 		while ((send_done_count != tx->pkt_done) ||
2989 		       (rx_done->entry[rx_done->idx].length != 0)) {
2990 			if (send_done_count != tx->pkt_done)
2991 				mxge_tx_done(ss, (int)send_done_count);
2992 			mxge_clean_rx_done(ss);
2993 			send_done_count = be32toh(stats->send_done_count);
2994 		}
2995 		if (sc->legacy_irq && mxge_deassert_wait)
2996 			wmb();
2997 	} while (*((volatile uint8_t *) &stats->valid));
2998 
2999 	/* fw link & error stats meaningful only on the first slice */
3000 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3001 		if (sc->link_state != stats->link_up) {
3002 			sc->link_state = stats->link_up;
3003 			if (sc->link_state) {
3004 				sc->ifp->if_link_state = LINK_STATE_UP;
3005 				if_link_state_change(sc->ifp);
3006 				if (mxge_verbose)
3007 					device_printf(sc->dev, "link up\n");
3008 			} else {
3009 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3010 				if_link_state_change(sc->ifp);
3011 				if (mxge_verbose)
3012 					device_printf(sc->dev, "link down\n");
3013 			}
3014 			sc->need_media_probe = 1;
3015 		}
3016 		if (sc->rdma_tags_available !=
3017 		    be32toh(stats->rdma_tags_available)) {
3018 			sc->rdma_tags_available =
3019 				be32toh(stats->rdma_tags_available);
3020 			device_printf(sc->dev, "RDMA timed out! %d tags "
3021 				      "left\n", sc->rdma_tags_available);
3022 		}
3023 
3024 		if (stats->link_down) {
3025 			sc->down_cnt += stats->link_down;
3026 			sc->link_state = 0;
3027 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3028 			if_link_state_change(sc->ifp);
3029 		}
3030 	}
3031 
	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
		*ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
3036 }
3037 
3038 static void
3039 mxge_init(void *arg)
3040 {
3041 }
3042 
3043 
3044 
3045 static void
3046 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3047 {
3048 	struct lro_entry *lro_entry;
3049 	int i;
3050 
3051 	while (!SLIST_EMPTY(&ss->lro_free)) {
3052 		lro_entry = SLIST_FIRST(&ss->lro_free);
3053 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3054 		kfree(lro_entry, M_DEVBUF);
3055 	}
3056 
3057 	for (i = 0; i <= ss->rx_big.mask; i++) {
3058 		if (ss->rx_big.info[i].m == NULL)
3059 			continue;
3060 		bus_dmamap_unload(ss->rx_big.dmat,
3061 				  ss->rx_big.info[i].map);
3062 		m_freem(ss->rx_big.info[i].m);
3063 		ss->rx_big.info[i].m = NULL;
3064 	}
3065 
3066 	for (i = 0; i <= ss->rx_small.mask; i++) {
3067 		if (ss->rx_small.info[i].m == NULL)
3068 			continue;
3069 		bus_dmamap_unload(ss->rx_small.dmat,
3070 				  ss->rx_small.info[i].map);
3071 		m_freem(ss->rx_small.info[i].m);
3072 		ss->rx_small.info[i].m = NULL;
3073 	}
3074 
3075 	/* transmit ring used only on the first slice */
3076 	if (ss->tx.info == NULL)
3077 		return;
3078 
3079 	for (i = 0; i <= ss->tx.mask; i++) {
3080 		ss->tx.info[i].flag = 0;
3081 		if (ss->tx.info[i].m == NULL)
3082 			continue;
3083 		bus_dmamap_unload(ss->tx.dmat,
3084 				  ss->tx.info[i].map);
3085 		m_freem(ss->tx.info[i].m);
3086 		ss->tx.info[i].m = NULL;
3087 	}
3088 }
3089 
3090 static void
3091 mxge_free_mbufs(mxge_softc_t *sc)
3092 {
3093 	int slice;
3094 
3095 	for (slice = 0; slice < sc->num_slices; slice++)
3096 		mxge_free_slice_mbufs(&sc->ss[slice]);
3097 }
3098 
3099 static void
3100 mxge_free_slice_rings(struct mxge_slice_state *ss)
3101 {
3102 	int i;
3103 
3104 
3105 	if (ss->rx_done.entry != NULL)
3106 		mxge_dma_free(&ss->rx_done.dma);
3107 	ss->rx_done.entry = NULL;
3108 
3109 	if (ss->tx.req_bytes != NULL)
3110 		kfree(ss->tx.req_bytes, M_DEVBUF);
3111 	ss->tx.req_bytes = NULL;
3112 
3113 	if (ss->tx.seg_list != NULL)
3114 		kfree(ss->tx.seg_list, M_DEVBUF);
3115 	ss->tx.seg_list = NULL;
3116 
3117 	if (ss->rx_small.shadow != NULL)
3118 		kfree(ss->rx_small.shadow, M_DEVBUF);
3119 	ss->rx_small.shadow = NULL;
3120 
3121 	if (ss->rx_big.shadow != NULL)
3122 		kfree(ss->rx_big.shadow, M_DEVBUF);
3123 	ss->rx_big.shadow = NULL;
3124 
3125 	if (ss->tx.info != NULL) {
3126 		if (ss->tx.dmat != NULL) {
3127 			for (i = 0; i <= ss->tx.mask; i++) {
3128 				bus_dmamap_destroy(ss->tx.dmat,
3129 						   ss->tx.info[i].map);
3130 			}
3131 			bus_dma_tag_destroy(ss->tx.dmat);
3132 		}
3133 		kfree(ss->tx.info, M_DEVBUF);
3134 	}
3135 	ss->tx.info = NULL;
3136 
3137 	if (ss->rx_small.info != NULL) {
3138 		if (ss->rx_small.dmat != NULL) {
3139 			for (i = 0; i <= ss->rx_small.mask; i++) {
3140 				bus_dmamap_destroy(ss->rx_small.dmat,
3141 						   ss->rx_small.info[i].map);
3142 			}
3143 			bus_dmamap_destroy(ss->rx_small.dmat,
3144 					   ss->rx_small.extra_map);
3145 			bus_dma_tag_destroy(ss->rx_small.dmat);
3146 		}
3147 		kfree(ss->rx_small.info, M_DEVBUF);
3148 	}
3149 	ss->rx_small.info = NULL;
3150 
3151 	if (ss->rx_big.info != NULL) {
3152 		if (ss->rx_big.dmat != NULL) {
3153 			for (i = 0; i <= ss->rx_big.mask; i++) {
3154 				bus_dmamap_destroy(ss->rx_big.dmat,
3155 						   ss->rx_big.info[i].map);
3156 			}
3157 			bus_dmamap_destroy(ss->rx_big.dmat,
3158 					   ss->rx_big.extra_map);
3159 			bus_dma_tag_destroy(ss->rx_big.dmat);
3160 		}
3161 		kfree(ss->rx_big.info, M_DEVBUF);
3162 	}
3163 	ss->rx_big.info = NULL;
3164 }
3165 
3166 static void
3167 mxge_free_rings(mxge_softc_t *sc)
3168 {
3169 	int slice;
3170 
3171 	for (slice = 0; slice < sc->num_slices; slice++)
3172 		mxge_free_slice_rings(&sc->ss[slice]);
3173 }
3174 
3175 static int
3176 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3177 		       int tx_ring_entries)
3178 {
3179 	mxge_softc_t *sc = ss->sc;
3180 	size_t bytes;
3181 	int err, i;
3182 
3183 	err = ENOMEM;
3184 
3185 	/* allocate per-slice receive resources */
3186 
3187 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
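	/* the completion ring must cover both the small and big
	 * receive rings, hence twice the entries */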
3188 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3189 
3190 	/* allocate the rx shadow rings */
3191 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3192 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3193 	if (ss->rx_small.shadow == NULL)
		return err;
3195 
3196 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3197 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3198 	if (ss->rx_big.shadow == NULL)
		return err;
3200 
3201 	/* allocate the rx host info rings */
3202 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3203 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3204 	if (ss->rx_small.info == NULL)
		return err;
3206 
3207 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3208 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3209 	if (ss->rx_big.info == NULL)
		return err;
3211 
3212 	/* allocate the rx busdma resources */
3213 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3214 				 1,			/* alignment */
3215 				 4096,			/* boundary */
3216 				 BUS_SPACE_MAXADDR,	/* low */
3217 				 BUS_SPACE_MAXADDR,	/* high */
3218 				 NULL, NULL,		/* filter */
3219 				 MHLEN,			/* maxsize */
3220 				 1,			/* num segs */
3221 				 MHLEN,			/* maxsegsize */
3222 				 BUS_DMA_ALLOCNOW,	/* flags */
3223 				 &ss->rx_small.dmat);	/* tag */
3224 	if (err != 0) {
3225 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3226 			      err);
		return err;
3228 	}
3229 
3230 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3231 				 1,			/* alignment */
3232 #if MXGE_VIRT_JUMBOS
3233 				 4096,			/* boundary */
3234 #else
3235 				 0,			/* boundary */
3236 #endif
3237 				 BUS_SPACE_MAXADDR,	/* low */
3238 				 BUS_SPACE_MAXADDR,	/* high */
3239 				 NULL, NULL,		/* filter */
3240 				 3*4096,		/* maxsize */
3241 #if MXGE_VIRT_JUMBOS
3242 				 3,			/* num segs */
3243 				 4096,			/* maxsegsize*/
3244 #else
3245 				 1,			/* num segs */
3246 				 MJUM9BYTES,		/* maxsegsize*/
3247 #endif
3248 				 BUS_DMA_ALLOCNOW,	/* flags */
3249 				 &ss->rx_big.dmat);	/* tag */
3250 	if (err != 0) {
3251 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3252 			      err);
		return err;
3254 	}
3255 	for (i = 0; i <= ss->rx_small.mask; i++) {
3256 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3257 					&ss->rx_small.info[i].map);
3258 		if (err != 0) {
3259 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3260 				      err);
			return err;
3262 		}
3263 	}
3264 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3265 				&ss->rx_small.extra_map);
3266 	if (err != 0) {
3267 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3268 			      err);
		return err;
3270 	}
3271 
3272 	for (i = 0; i <= ss->rx_big.mask; i++) {
3273 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3274 					&ss->rx_big.info[i].map);
3275 		if (err != 0) {
3276 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3277 				      err);
			return err;
3279 		}
3280 	}
3281 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3282 				&ss->rx_big.extra_map);
3283 	if (err != 0) {
3284 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3285 			      err);
		return err;
3287 	}
3288 
	/* now allocate TX resources */
3290 
3291 #ifndef IFNET_BUF_RING
3292 	/* only use a single TX ring for now */
3293 	if (ss != ss->sc->ss)
3294 		return 0;
3295 #endif
3296 
3297 	ss->tx.mask = tx_ring_entries - 1;
3298 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3299 
3300 
3301 	/* allocate the tx request copy block */
3302 	bytes = 8 +
3303 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3304 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3305 	if (ss->tx.req_bytes == NULL)
		return err;
3307 	/* ensure req_list entries are aligned to 8 bytes */
3308 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3309 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3310 
3311 	/* allocate the tx busdma segment list */
3312 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3313 	ss->tx.seg_list = (bus_dma_segment_t *)
3314 		kmalloc(bytes, M_DEVBUF, M_WAITOK);
3315 	if (ss->tx.seg_list == NULL)
		return err;
3317 
3318 	/* allocate the tx host info ring */
3319 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3320 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3321 	if (ss->tx.info == NULL)
		return err;
3323 
3324 	/* allocate the tx busdma resources */
3325 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3326 				 1,			/* alignment */
3327 				 sc->tx_boundary,	/* boundary */
3328 				 BUS_SPACE_MAXADDR,	/* low */
3329 				 BUS_SPACE_MAXADDR,	/* high */
3330 				 NULL, NULL,		/* filter */
3331 				 65536 + 256,		/* maxsize */
3332 				 ss->tx.max_desc - 2,	/* num segs */
3333 				 sc->tx_boundary,	/* maxsegsz */
3334 				 BUS_DMA_ALLOCNOW,	/* flags */
3335 				 &ss->tx.dmat);		/* tag */
3336 
3337 	if (err != 0) {
3338 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3339 			      err);
		return err;
3341 	}
3342 
3343 	/* now use these tags to setup dmamaps for each slot
3344 	   in the ring */
3345 	for (i = 0; i <= ss->tx.mask; i++) {
3346 		err = bus_dmamap_create(ss->tx.dmat, 0,
3347 					&ss->tx.info[i].map);
3348 		if (err != 0) {
3349 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3350 				      err);
			return err;
3352 		}
3353 	}
3354 	return 0;
3355 
3356 }
3357 
3358 static int
3359 mxge_alloc_rings(mxge_softc_t *sc)
3360 {
3361 	mxge_cmd_t cmd;
3362 	int tx_ring_size;
3363 	int tx_ring_entries, rx_ring_entries;
3364 	int err, slice;
3365 
3366 	/* get ring sizes */
3367 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3368 	tx_ring_size = cmd.data0;
3369 	if (err != 0) {
3370 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3371 		goto abort;
3372 	}
3373 
3374 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3375 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3376 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3377 	ifq_set_ready(&sc->ifp->if_snd);
3378 
3379 	for (slice = 0; slice < sc->num_slices; slice++) {
3380 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3381 					     rx_ring_entries,
3382 					     tx_ring_entries);
3383 		if (err != 0)
3384 			goto abort;
3385 	}
3386 	return 0;
3387 
3388 abort:
3389 	mxge_free_rings(sc);
3390 	return err;
3391 
3392 }
3393 
3394 
3395 static void
3396 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3397 {
3398 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3399 
3400 	if (bufsize < MCLBYTES) {
3401 		/* easy, everything fits in a single buffer */
3402 		*big_buf_size = MCLBYTES;
3403 		*cl_size = MCLBYTES;
3404 		*nbufs = 1;
3405 		return;
3406 	}
3407 
3408 	if (bufsize < MJUMPAGESIZE) {
3409 		/* still easy, everything still fits in a single buffer */
3410 		*big_buf_size = MJUMPAGESIZE;
3411 		*cl_size = MJUMPAGESIZE;
3412 		*nbufs = 1;
3413 		return;
3414 	}
3415 #if MXGE_VIRT_JUMBOS
3416 	/* now we need to use virtually contiguous buffers */
3417 	*cl_size = MJUM9BYTES;
3418 	*big_buf_size = 4096;
3419 	*nbufs = mtu / 4096 + 1;
3420 	/* needs to be a power of two, so round up */
3421 	if (*nbufs == 3)
3422 		*nbufs = 4;
3423 #else
3424 	*cl_size = MJUM9BYTES;
3425 	*big_buf_size = MJUM9BYTES;
3426 	*nbufs = 1;
3427 #endif
3428 }
3429 
3430 static int
3431 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3432 {
3433 	mxge_softc_t *sc;
3434 	mxge_cmd_t cmd;
3435 	bus_dmamap_t map;
3436 	struct lro_entry *lro_entry;
3437 	int err, i, slice;
3438 
3439 
3440 	sc = ss->sc;
3441 	slice = ss - sc->ss;
3442 
3443 	SLIST_INIT(&ss->lro_free);
3444 	SLIST_INIT(&ss->lro_active);
3445 
3446 	for (i = 0; i < sc->lro_cnt; i++) {
3447 		lro_entry = (struct lro_entry *)
3448 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3449 			       M_NOWAIT | M_ZERO);
3450 		if (lro_entry == NULL) {
3451 			sc->lro_cnt = i;
3452 			break;
3453 		}
3454 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3455 	}
3456 	/* get the lanai pointers to the send and receive rings */
3457 
3458 	err = 0;
3459 #ifndef IFNET_BUF_RING
3460 	/* We currently only send from the first slice */
3461 	if (slice == 0) {
3462 #endif
3463 		cmd.data0 = slice;
3464 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3465 		ss->tx.lanai =
3466 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3467 		ss->tx.send_go = (volatile uint32_t *)
3468 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3471 #ifndef IFNET_BUF_RING
3472 	}
3473 #endif
3474 	cmd.data0 = slice;
3475 	err |= mxge_send_cmd(sc,
3476 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3477 	ss->rx_small.lanai =
3478 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3479 	cmd.data0 = slice;
3480 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3481 	ss->rx_big.lanai =
3482 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3483 
3484 	if (err != 0) {
3485 		device_printf(sc->dev,
3486 			      "failed to get ring sizes or locations\n");
3487 		return EIO;
3488 	}
3489 
3490 	/* stock receive rings */
3491 	for (i = 0; i <= ss->rx_small.mask; i++) {
3492 		map = ss->rx_small.info[i].map;
3493 		err = mxge_get_buf_small(ss, map, i);
3494 		if (err) {
3495 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3496 				      i, ss->rx_small.mask + 1);
3497 			return ENOMEM;
3498 		}
3499 	}
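	/* preset the big shadow ring to the all-ones invalid-address
	 * marker; when nbufs > 1 only every nbufs-th slot gets a
	 * buffer, and the firmware must ignore the rest */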
3500 	for (i = 0; i <= ss->rx_big.mask; i++) {
3501 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3502 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3503 	}
3504 	ss->rx_big.nbufs = nbufs;
3505 	ss->rx_big.cl_size = cl_size;
3506 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3507 		EVL_ENCAPLEN + MXGEFW_PAD;
3508 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3509 		map = ss->rx_big.info[i].map;
3510 		err = mxge_get_buf_big(ss, map, i);
3511 		if (err) {
3512 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3513 				      i, ss->rx_big.mask + 1);
3514 			return ENOMEM;
3515 		}
3516 	}
3517 	return 0;
3518 }
3519 
3520 static int
3521 mxge_open(mxge_softc_t *sc)
3522 {
3523 	mxge_cmd_t cmd;
3524 	int err, big_bytes, nbufs, slice, cl_size, i;
3525 	bus_addr_t bus;
3526 	volatile uint8_t *itable;
3527 	struct mxge_slice_state *ss;
3528 
3529 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3530 	/* Copy the MAC address in case it was overridden */
3531 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3532 
3533 	err = mxge_reset(sc, 1);
3534 	if (err != 0) {
3535 		device_printf(sc->dev, "failed to reset\n");
3536 		return EIO;
3537 	}
3538 
3539 	if (sc->num_slices > 1) {
3540 		/* setup the indirection table */
3541 		cmd.data0 = sc->num_slices;
3542 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3543 				    &cmd);
3544 
3545 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3546 				     &cmd);
3547 		if (err != 0) {
3548 			device_printf(sc->dev,
3549 				      "failed to setup rss tables\n");
3550 			return err;
3551 		}
3552 
3553 		/* just enable an identity mapping */
3554 		itable = sc->sram + cmd.data0;
3555 		for (i = 0; i < sc->num_slices; i++)
3556 			itable[i] = (uint8_t)i;
3557 
3558 		cmd.data0 = 1;
3559 		cmd.data1 = mxge_rss_hash_type;
3560 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3561 		if (err != 0) {
3562 			device_printf(sc->dev, "failed to enable slices\n");
3563 			return err;
3564 		}
3565 	}
3566 
3567 
3568 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3569 
3570 	cmd.data0 = nbufs;
3571 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3572 			    &cmd);
3573 	/* error is only meaningful if we're trying to set
3574 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3575 	if (err && nbufs > 1) {
3576 		device_printf(sc->dev,
3577 			      "Failed to set alway-use-n to %d\n",
3578 			      nbufs);
3579 		return EIO;
3580 	}
3581 	/* Give the firmware the mtu and the big and small buffer
3582 	   sizes.  The firmware wants the big buf size to be a power
3583 	   of two. Luckily, FreeBSD's clusters are powers of two */
3584 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3585 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3586 	cmd.data0 = MHLEN - MXGEFW_PAD;
3587 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3588 			     &cmd);
3589 	cmd.data0 = big_bytes;
3590 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3591 
3592 	if (err != 0) {
3593 		device_printf(sc->dev, "failed to setup params\n");
3594 		goto abort;
3595 	}
3596 
	/* Now give the firmware the pointer to the stats block */
3598 	for (slice = 0;
3599 #ifdef IFNET_BUF_RING
3600 	     slice < sc->num_slices;
3601 #else
3602 	     slice < 1;
3603 #endif
3604 	     slice++) {
3605 		ss = &sc->ss[slice];
3606 		cmd.data0 =
3607 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3608 		cmd.data1 =
3609 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3610 		cmd.data2 = sizeof(struct mcp_irq_data);
3611 		cmd.data2 |= (slice << 16);
3612 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3613 	}
3614 
3615 	if (err != 0) {
3616 		bus = sc->ss->fw_stats_dma.bus_addr;
3617 		bus += offsetof(struct mcp_irq_data, send_done_count);
3618 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3619 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3620 		err = mxge_send_cmd(sc,
3621 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3622 				    &cmd);
3623 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3624 		sc->fw_multicast_support = 0;
3625 	} else {
3626 		sc->fw_multicast_support = 1;
3627 	}
3628 
3629 	if (err != 0) {
3630 		device_printf(sc->dev, "failed to setup params\n");
3631 		goto abort;
3632 	}
3633 
3634 	for (slice = 0; slice < sc->num_slices; slice++) {
3635 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3636 		if (err != 0) {
3637 			device_printf(sc->dev, "couldn't open slice %d\n",
3638 				      slice);
3639 			goto abort;
3640 		}
3641 	}
3642 
3643 	/* Finally, start the firmware running */
3644 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3645 	if (err) {
3646 		device_printf(sc->dev, "Couldn't bring up link\n");
3647 		goto abort;
3648 	}
3649 #ifdef IFNET_BUF_RING
3650 	for (slice = 0; slice < sc->num_slices; slice++) {
3651 		ss = &sc->ss[slice];
3652 		ss->if_flags |= IFF_RUNNING;
3653 		ss->if_flags &= ~IFF_OACTIVE;
3654 	}
3655 #endif
3656 	sc->ifp->if_flags |= IFF_RUNNING;
3657 	sc->ifp->if_flags &= ~IFF_OACTIVE;
3658 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3659 
3660 	return 0;
3661 
3662 
3663 abort:
3664 	mxge_free_mbufs(sc);
3665 
3666 	return err;
3667 }
3668 
3669 static int
3670 mxge_close(mxge_softc_t *sc)
3671 {
3672 	mxge_cmd_t cmd;
3673 	int err, old_down_cnt;
3674 #ifdef IFNET_BUF_RING
3675 	struct mxge_slice_state *ss;
3676 	int slice;
3677 #endif
3678 
3679 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3680 	callout_stop(&sc->co_hdl);
3681 #ifdef IFNET_BUF_RING
3682 	for (slice = 0; slice < sc->num_slices; slice++) {
3683 		ss = &sc->ss[slice];
3684 		ss->if_flags &= ~IFF_RUNNING;
3685 	}
3686 #endif
3687 	sc->ifp->if_flags &= ~IFF_RUNNING;
3688 	old_down_cnt = sc->down_cnt;
3689 	wmb();
3690 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3691 	if (err) {
3692 		device_printf(sc->dev, "Couldn't bring down link\n");
3693 	}
3694 	if (old_down_cnt == sc->down_cnt) {
3695 		/* wait for down irq */
3696 		DELAY(10 * sc->intr_coal_delay);
3697 	}
3698 	wmb();
3699 	if (old_down_cnt == sc->down_cnt) {
3700 		device_printf(sc->dev, "never got down irq\n");
3701 	}
3702 
3703 	mxge_free_mbufs(sc);
3704 
3705 	return 0;
3706 }
3707 
3708 static void
3709 mxge_setup_cfg_space(mxge_softc_t *sc)
3710 {
3711 	device_t dev = sc->dev;
3712 	int reg;
3713 	uint16_t cmd, lnk, pectl;
3714 
	/* find the PCIe link width and set max read request to 4KB */
3716 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3717 		lnk = pci_read_config(dev, reg + 0x12, 2);
3718 		sc->link_width = (lnk >> 4) & 0x3f;
3719 
3720 		pectl = pci_read_config(dev, reg + 0x8, 2);
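		/* the max read request size field is bits 14:12 of the
		 * PCIe device control register (capability offset 0x8);
		 * the value 5 selects 4096 bytes */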
3721 		pectl = (pectl & ~0x7000) | (5 << 12);
3722 		pci_write_config(dev, reg + 0x8, pectl, 2);
3723 	}
3724 
3725 	/* Enable DMA and Memory space access */
3726 	pci_enable_busmaster(dev);
3727 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3728 	cmd |= PCIM_CMD_MEMEN;
3729 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3730 }
3731 
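/*
 * Fetch the firmware reboot-status counter through the vendor-specific
 * capability's "read32" window: write 0x3 to vs+0x10 to select 32-bit
 * read mode, write the target address to vs+0x18, then read the data
 * back from vs+0x14.  0xfffffff0 appears to be the fixed address of
 * the reboot counter (offsets inferred from the accesses below).
 */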
3732 static uint32_t
3733 mxge_read_reboot(mxge_softc_t *sc)
3734 {
3735 	device_t dev = sc->dev;
3736 	uint32_t vs;
3737 
3738 	/* find the vendor specific offset */
3739 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3740 		device_printf(sc->dev,
3741 			      "could not find vendor specific offset\n");
3742 		return (uint32_t)-1;
3743 	}
3744 	/* enable read32 mode */
3745 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3746 	/* tell NIC which register to read */
3747 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3748 	return (pci_read_config(dev, vs + 0x14, 4));
3749 }
3750 
3751 static int
3752 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3753 {
3754 	struct pci_devinfo *dinfo;
3755 	mxge_tx_ring_t *tx;
3756 	int err;
3757 	uint32_t reboot;
3758 	uint16_t cmd;
3759 
3760 	err = ENXIO;
3761 
3762 	device_printf(sc->dev, "Watchdog reset!\n");
3763 
3764 	/*
3765 	 * check to see if the NIC rebooted.  If it did, then all of
3766 	 * PCI config space has been reset, and things like the
3767 	 * busmaster bit will be zero.  If this is the case, then we
3768 	 * must restore PCI config space before the NIC can be used
3769 	 * again
3770 	 */
3771 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3772 	if (cmd == 0xffff) {
3773 		/*
3774 		 * maybe the watchdog caught the NIC rebooting; wait
3775 		 * up to 100ms for it to finish.  If it does not come
3776 		 * back, then give up
3777 		 */
3778 		DELAY(1000*100);
3779 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3780 		if (cmd == 0xffff) {
3781 			device_printf(sc->dev, "NIC disappeared!\n");
3782 			return (err);
3783 		}
3784 	}
3785 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3786 		/* print the reboot status */
3787 		reboot = mxge_read_reboot(sc);
3788 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3789 			      reboot);
3790 		/* restore PCI configuration space */
3791 		dinfo = device_get_ivars(sc->dev);
3792 		pci_cfg_restore(sc->dev, dinfo);
3793 
3794 		/* and redo any changes we made to our config space */
3795 		mxge_setup_cfg_space(sc);
3796 
3797 		if (sc->ifp->if_flags & IFF_RUNNING) {
3798 			mxge_close(sc);
3799 			err = mxge_open(sc);
3800 		}
3801 	} else {
3802 		tx = &sc->ss[slice].tx;
3803 		device_printf(sc->dev,
3804 			      "NIC did not reboot, slice %d ring state:\n",
3805 			      slice);
3806 		device_printf(sc->dev,
3807 			      "tx.req=%d tx.done=%d tx.queue_active=%d\n",
3808 			      tx->req, tx->done, tx->queue_active);
3809 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3810 			      tx->activate, tx->deactivate);
3811 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3812 			      tx->pkt_done,
3813 			      be32toh(sc->ss->fw_stats->send_done_count));
3814 		device_printf(sc->dev, "not resetting\n");
3815 	}
3816 	return (err);
3817 }
3818 
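/*
 * Per-tick transmit stall check.  A slice is deemed stuck when the
 * ring has outstanding work (req != done), it already had outstanding
 * work at the previous tick (watchdog_req != watchdog_done), and no
 * completions arrived in between (done == watchdog_done).  E.g. if
 * req=100/done=90 is seen on two consecutive ticks with done frozen,
 * the NIC is reset -- unless the dropped_pause counter also moved, in
 * which case the link partner's flow control is blamed instead.
 */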
3819 static int
3820 mxge_watchdog(mxge_softc_t *sc)
3821 {
3822 	mxge_tx_ring_t *tx;
3823 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3824 	int i, err = 0;
3825 
3826 	/* see if we have outstanding transmits that have
3827 	   been pending for more than mxge_ticks */
3828 	for (i = 0;
3829 #ifdef IFNET_BUF_RING
3830 	     (i < sc->num_slices) && (err == 0);
3831 #else
3832 	     (i < 1) && (err == 0);
3833 #endif
3834 	     i++) {
3835 		tx = &sc->ss[i].tx;
3836 		if (tx->req != tx->done &&
3837 		    tx->watchdog_req != tx->watchdog_done &&
3838 		    tx->done == tx->watchdog_done) {
3839 			/* check for pause blocking before resetting */
3840 			if (tx->watchdog_rx_pause == rx_pause)
3841 				err = mxge_watchdog_reset(sc, i);
3842 			else
3843 				device_printf(sc->dev, "Flow control blocking "
3844 					      "xmits, check link partner\n");
3845 		}
3846 
3847 		tx->watchdog_req = tx->req;
3848 		tx->watchdog_done = tx->done;
3849 		tx->watchdog_rx_pause = rx_pause;
3850 	}
3851 
3852 	if (sc->need_media_probe)
3853 		mxge_media_probe(sc);
3854 	return (err);
3855 }
3856 
3857 static void
3858 mxge_update_stats(mxge_softc_t *sc)
3859 {
3860 	struct mxge_slice_state *ss;
3861 	u_long ipackets = 0;
3862 	u_long opackets = 0;
3863 #ifdef IFNET_BUF_RING
3864 	u_long obytes = 0;
3865 	u_long omcasts = 0;
3866 	u_long odrops = 0;
3867 #endif
3868 	u_long oerrors = 0;
3869 	int slice;
3870 
3871 	for (slice = 0; slice < sc->num_slices; slice++) {
3872 		ss = &sc->ss[slice];
3873 		ipackets += ss->ipackets;
3874 		opackets += ss->opackets;
3875 #ifdef IFNET_BUF_RING
3876 		obytes += ss->obytes;
3877 		omcasts += ss->omcasts;
3878 		odrops += ss->tx.br->br_drops;
3879 #endif
3880 		oerrors += ss->oerrors;
3881 	}
3882 	sc->ifp->if_ipackets = ipackets;
3883 	sc->ifp->if_opackets = opackets;
3884 #ifdef IFNET_BUF_RING
3885 	sc->ifp->if_obytes = obytes;
3886 	sc->ifp->if_omcasts = omcasts;
3887 	sc->ifp->if_snd.ifq_drops = odrops;
3888 #endif
3889 	sc->ifp->if_oerrors = oerrors;
3890 }
3891 
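/*
 * Periodic housekeeping, rescheduled every mxge_ticks (default hz/2).
 * The watchdog itself only runs on every fifth pass, i.e. roughly
 * every 2.5 seconds with the default tick interval.
 */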
3892 static void
3893 mxge_tick(void *arg)
3894 {
3895 	mxge_softc_t *sc = arg;
3896 	int err = 0;
3897 
3898 	lwkt_serialize_enter(sc->ifp->if_serializer);
3899 	/* aggregate stats from different slices */
3900 	mxge_update_stats(sc);
3901 	if (!sc->watchdog_countdown) {
3902 		err = mxge_watchdog(sc);
3903 		sc->watchdog_countdown = 4;
3904 	}
3905 	sc->watchdog_countdown--;
3906 	if (err == 0)
3907 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3908 	lwkt_serialize_exit(sc->ifp->if_serializer);
3909 }
3910 
3911 static int
3912 mxge_media_change(struct ifnet *ifp)
3913 {
3914 	return EINVAL;
3915 }
3916 
3917 static int
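/*
 * The firmware limit (sc->max_mtu) is on the whole frame, so the
 * requested MTU is validated with Ethernet framing added on: e.g. a
 * 9000-byte MTU requires 9000 + ETHER_HDR_LEN (14) + EVL_ENCAPLEN (4)
 * = 9018 bytes.
 */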
3918 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3919 {
3920 	struct ifnet *ifp = sc->ifp;
3921 	int real_mtu, old_mtu;
3922 	int err = 0;
3923 
3924 	if (ifp->if_serializer)
3925 		ASSERT_SERIALIZED(ifp->if_serializer);
3926 
3927 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3928 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3929 		return EINVAL;
3930 	old_mtu = ifp->if_mtu;
3931 	ifp->if_mtu = mtu;
3932 	if (ifp->if_flags & IFF_RUNNING) {
3933 		mxge_close(sc);
3934 		err = mxge_open(sc);
3935 		if (err != 0) {
3936 			ifp->if_mtu = old_mtu;
3937 			mxge_close(sc);
3938 			(void) mxge_open(sc);
3939 		}
3940 	}
3941 	return err;
3942 }
3943 
3944 static void
3945 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3946 {
3947 	mxge_softc_t *sc = ifp->if_softc;
3948 
3950 	if (sc == NULL)
3951 		return;
3952 	ifmr->ifm_status = IFM_AVALID;
3953 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3954 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3955 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3956 }
3957 
3958 static int
3959 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3960 {
3961 	mxge_softc_t *sc = ifp->if_softc;
3962 	struct ifreq *ifr = (struct ifreq *)data;
3963 	int err, mask;
3964 
3965 	(void)cr;
3966 	err = 0;
3967 	ASSERT_SERIALIZED(ifp->if_serializer);
3968 	switch (command) {
3969 	case SIOCSIFADDR:
3970 	case SIOCGIFADDR:
3971 		err = ether_ioctl(ifp, command, data);
3972 		break;
3973 
3974 	case SIOCSIFMTU:
3975 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3976 		break;
3977 
3978 	case SIOCSIFFLAGS:
3979 		if (sc->dying) {
3980 			return EINVAL;
3981 		}
3982 		if (ifp->if_flags & IFF_UP) {
3983 			if (!(ifp->if_flags & IFF_RUNNING)) {
3984 				err = mxge_open(sc);
3985 			} else {
3986 				/* take care of promisc and allmulti
3987 				   flag changes */
3988 				mxge_change_promisc(sc,
3989 						    ifp->if_flags & IFF_PROMISC);
3990 				mxge_set_multicast_list(sc);
3991 			}
3992 		} else {
3993 			if (ifp->if_flags & IFF_RUNNING) {
3994 				mxge_close(sc);
3995 			}
3996 		}
3997 		break;
3998 
3999 	case SIOCADDMULTI:
4000 	case SIOCDELMULTI:
4001 		mxge_set_multicast_list(sc);
4002 		break;
4003 
4004 	case SIOCSIFCAP:
4005 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4006 		if (mask & IFCAP_TXCSUM) {
4007 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4008 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4009 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4010 						      | CSUM_TSO);
4011 			} else {
4012 				ifp->if_capenable |= IFCAP_TXCSUM;
4013 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4014 			}
4015 		} else if (mask & IFCAP_RXCSUM) {
4016 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4017 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4018 				sc->csum_flag = 0;
4019 			} else {
4020 				ifp->if_capenable |= IFCAP_RXCSUM;
4021 				sc->csum_flag = 1;
4022 			}
4023 		}
4024 		if (mask & IFCAP_TSO4) {
4025 			if (IFCAP_TSO4 & ifp->if_capenable) {
4026 				ifp->if_capenable &= ~IFCAP_TSO4;
4027 				ifp->if_hwassist &= ~CSUM_TSO;
4028 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4029 				ifp->if_capenable |= IFCAP_TSO4;
4030 				ifp->if_hwassist |= CSUM_TSO;
4031 			} else {
4032 				kprintf("mxge requires tx checksum offload"
4033 				       " to be enabled to use TSO\n");
4034 				err = EINVAL;
4035 			}
4036 		}
4037 		if (mask & IFCAP_LRO) {
4038 			if (IFCAP_LRO & ifp->if_capenable)
4039 				err = mxge_change_lro_locked(sc, 0);
4040 			else
4041 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4042 		}
4043 		if (mask & IFCAP_VLAN_HWTAGGING)
4044 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4045 		VLAN_CAPABILITIES(ifp);
4046 
4047 		break;
4048 
4049 	case SIOCGIFMEDIA:
4050 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4051 				    &sc->media, command);
4052 		break;
4053 
4054 	default:
4055 		err = ENOTTY;
4056 	}
4057 	return err;
4058 }
4059 
4060 static void
4061 mxge_fetch_tunables(mxge_softc_t *sc)
4062 {
4063 
4064 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4065 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4066 			  &mxge_flow_control);
4067 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4068 			  &mxge_intr_coal_delay);
4069 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4070 			  &mxge_nvidia_ecrc_enable);
4071 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4072 			  &mxge_force_firmware);
4073 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4074 			  &mxge_deassert_wait);
4075 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4076 			  &mxge_verbose);
4077 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4078 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4079 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4080 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4081 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4082 	if (sc->lro_cnt != 0)
4083 		mxge_lro_cnt = sc->lro_cnt;
4084 
4085 	if (bootverbose)
4086 		mxge_verbose = 1;
4087 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4088 		mxge_intr_coal_delay = 30;
4089 	if (mxge_ticks == 0)
4090 		mxge_ticks = hz / 2;
4091 	sc->pause = mxge_flow_control;
4092 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4093 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4094 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4095 	}
4096 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4097 	    mxge_initial_mtu < ETHER_MIN_LEN)
4098 		mxge_initial_mtu = ETHERMTU_JUMBO;
4099 }
4100 
4102 static void
4103 mxge_free_slices(mxge_softc_t *sc)
4104 {
4105 	struct mxge_slice_state *ss;
4106 	int i;
4107 
4109 	if (sc->ss == NULL)
4110 		return;
4111 
4112 	for (i = 0; i < sc->num_slices; i++) {
4113 		ss = &sc->ss[i];
4114 		if (ss->fw_stats != NULL) {
4115 			mxge_dma_free(&ss->fw_stats_dma);
4116 			ss->fw_stats = NULL;
4117 #ifdef IFNET_BUF_RING
4118 			if (ss->tx.br != NULL) {
4119 				drbr_free(ss->tx.br, M_DEVBUF);
4120 				ss->tx.br = NULL;
4121 			}
4122 #endif
4123 		}
4124 		if (ss->rx_done.entry != NULL) {
4125 			mxge_dma_free(&ss->rx_done.dma);
4126 			ss->rx_done.entry = NULL;
4127 		}
4128 	}
4129 	kfree(sc->ss, M_DEVBUF);
4130 	sc->ss = NULL;
4131 }
4132 
4133 static int
4134 mxge_alloc_slices(mxge_softc_t *sc)
4135 {
4136 	mxge_cmd_t cmd;
4137 	struct mxge_slice_state *ss;
4138 	size_t bytes;
4139 	int err, i, max_intr_slots;
4140 
4141 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4142 	if (err != 0) {
4143 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4144 		return err;
4145 	}
4146 	sc->rx_ring_size = cmd.data0;
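	/*
	 * Each slice has two receive rings (small and big buffers), and
	 * each ring entry can produce a completion, so the interrupt
	 * queue is sized for twice the number of receive descriptors
	 * (sizing rationale presumed from the factor of two below).
	 */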
4147 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4148 
4149 	bytes = sizeof (*sc->ss) * sc->num_slices;
4150 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4151 	if (sc->ss == NULL)
4152 		return (ENOMEM);
4153 	for (i = 0; i < sc->num_slices; i++) {
4154 		ss = &sc->ss[i];
4155 
4156 		ss->sc = sc;
4157 
4158 		/* allocate per-slice rx interrupt queues */
4159 
4160 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4161 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4162 		if (err != 0)
4163 			goto abort;
4164 		ss->rx_done.entry = ss->rx_done.dma.addr;
4165 		bzero(ss->rx_done.entry, bytes);
4166 
4167 		/*
4168 		 * allocate the per-slice firmware stats; stats
4169 		 * (including tx) are used only on the first
4170 		 * slice for now
4171 		 */
4172 #ifndef IFNET_BUF_RING
4173 		if (i > 0)
4174 			continue;
4175 #endif
4176 
4177 		bytes = sizeof (*ss->fw_stats);
4178 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4179 				     bytes, 64);
4180 		if (err != 0)
4181 			goto abort;
4182 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4183 #ifdef IFNET_BUF_RING
4184 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4185 					   &ss->tx.lock);
4186 #endif
4187 	}
4188 
4189 	return (0);
4190 
4191 abort:
4192 	mxge_free_slices(sc);
4193 	return (ENOMEM);
4194 }
4195 
4196 static void
4197 mxge_slice_probe(mxge_softc_t *sc)
4198 {
4199 	mxge_cmd_t cmd;
4200 	char *old_fw;
4201 	int msix_cnt, status, max_intr_slots;
4202 
4203 	sc->num_slices = 1;
4204 	/*
4205 	 * don't enable multiple slices if they were not requested via
4206 	 * the hw.mxge.max_slices tunable, or if this is not an SMP system
4207 	 */
4208 
4209 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4210 		return;
4211 
4212 	/* see how many MSI-X interrupts are available */
4213 	msix_cnt = pci_msix_count(sc->dev);
4214 	if (msix_cnt < 2)
4215 		return;
4216 
4217 	/* now load the slice-aware firmware and see what it supports */
4218 	old_fw = sc->fw_name;
4219 	if (old_fw == mxge_fw_aligned)
4220 		sc->fw_name = mxge_fw_rss_aligned;
4221 	else
4222 		sc->fw_name = mxge_fw_rss_unaligned;
4223 	status = mxge_load_firmware(sc, 0);
4224 	if (status != 0) {
4225 		device_printf(sc->dev, "Falling back to a single slice\n");
4226 		return;
4227 	}
4228 
4229 	/* try to send a reset command to the card to see if it
4230 	   is alive */
4231 	memset(&cmd, 0, sizeof (cmd));
4232 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4233 	if (status != 0) {
4234 		device_printf(sc->dev, "failed reset\n");
4235 		goto abort_with_fw;
4236 	}
4237 
4238 	/* get rx ring size */
4239 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4240 	if (status != 0) {
4241 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4242 		goto abort_with_fw;
4243 	}
4244 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4245 
4246 	/* tell it the size of the interrupt queues */
4247 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4248 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4249 	if (status != 0) {
4250 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4251 		goto abort_with_fw;
4252 	}
4253 
4254 	/* ask for the maximum number of slices it supports */
4255 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4256 	if (status != 0) {
4257 		device_printf(sc->dev,
4258 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4259 		goto abort_with_fw;
4260 	}
4261 	sc->num_slices = cmd.data0;
4262 	if (sc->num_slices > msix_cnt)
4263 		sc->num_slices = msix_cnt;
4264 
4265 	if (mxge_max_slices == -1) {
4266 		/* cap to number of CPUs in system */
4267 		if (sc->num_slices > ncpus)
4268 			sc->num_slices = ncpus;
4269 	} else {
4270 		if (sc->num_slices > mxge_max_slices)
4271 			sc->num_slices = mxge_max_slices;
4272 	}
4273 	/* make sure it is a power of two */
4274 	while (sc->num_slices & (sc->num_slices - 1))
4275 		sc->num_slices--;
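	/*
	 * e.g. 6 -> 5 -> 4: the decrement loop rounds num_slices down to
	 * a power of two.  A constant-time equivalent, were it wanted,
	 * would be: sc->num_slices = 1 << (fls(sc->num_slices) - 1);
	 */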
4276 
4277 	if (mxge_verbose)
4278 		device_printf(sc->dev, "using %d slices\n",
4279 			      sc->num_slices);
4280 
4281 	return;
4282 
4283 abort_with_fw:
4284 	sc->fw_name = old_fw;
4285 	(void) mxge_load_firmware(sc, 0);
4286 }
4287 
4288 static int
4289 mxge_add_msix_irqs(mxge_softc_t *sc)
4290 {
4291 	size_t bytes;
4292 	int count, err, i, rid;
4293 
4294 	rid = PCIR_BAR(2);
4295 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4296 						    &rid, RF_ACTIVE);
4297 
4298 	if (sc->msix_table_res == NULL) {
4299 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4300 		return ENXIO;
4301 	}
4302 
4303 	count = sc->num_slices;
4304 	err = pci_alloc_msix(sc->dev, &count);
4305 	if (err != 0) {
4306 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4307 			      "err = %d\n", sc->num_slices, err);
4308 		goto abort_with_msix_table;
4309 	}
4310 	if (count < sc->num_slices) {
4311 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4312 			      sc->num_slices, count);
4313 		device_printf(sc->dev,
4314 			      "Try setting hw.mxge.max_slices to %d\n",
4315 			      count);
4316 		err = ENOSPC;
4317 		goto abort_with_msix;
4318 	}
4319 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4320 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4321 	if (sc->msix_irq_res == NULL) {
4322 		err = ENOMEM;
4323 		goto abort_with_msix;
4324 	}
4325 
4326 	for (i = 0; i < sc->num_slices; i++) {
4327 		rid = i + 1;
4328 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4329 							  SYS_RES_IRQ,
4330 							  &rid, RF_ACTIVE);
4331 		if (sc->msix_irq_res[i] == NULL) {
4332 			device_printf(sc->dev, "couldn't allocate IRQ res"
4333 				      " for message %d\n", i);
4334 			err = ENXIO;
4335 			goto abort_with_res;
4336 		}
4337 	}
4338 
4339 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4340 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4341 
4342 	for (i = 0; i < sc->num_slices; i++) {
4343 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4344 				     INTR_MPSAFE,
4345 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4346 				     sc->ifp->if_serializer);
4347 		if (err != 0) {
4348 			device_printf(sc->dev, "couldn't setup intr for "
4349 				      "message %d\n", i);
4350 			goto abort_with_intr;
4351 		}
4352 	}
4353 
4354 	if (mxge_verbose) {
4355 		device_printf(sc->dev, "using %d msix IRQs:",
4356 			      sc->num_slices);
4357 		for (i = 0; i < sc->num_slices; i++)
4358 			kprintf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4359 		kprintf("\n");
4360 	}
4361 	return (0);
4362 
4363 abort_with_intr:
4364 	for (i = 0; i < sc->num_slices; i++) {
4365 		if (sc->msix_ih[i] != NULL) {
4366 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4367 					  sc->msix_ih[i]);
4368 			sc->msix_ih[i] = NULL;
4369 		}
4370 	}
4371 	kfree(sc->msix_ih, M_DEVBUF);
4372 
4374 abort_with_res:
4375 	for (i = 0; i < sc->num_slices; i++) {
4376 		rid = i + 1;
4377 		if (sc->msix_irq_res[i] != NULL)
4378 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4379 					     sc->msix_irq_res[i]);
4380 		sc->msix_irq_res[i] = NULL;
4381 	}
4382 	kfree(sc->msix_irq_res, M_DEVBUF);
4383 
4385 abort_with_msix:
4386 	pci_release_msi(sc->dev);
4387 
4388 abort_with_msix_table:
4389 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4390 			     sc->msix_table_res);
4391 
4392 	return err;
4393 }
4394 
4395 static int
4396 mxge_add_single_irq(mxge_softc_t *sc)
4397 {
4398 	int count, err, rid;
4399 
4400 	count = pci_msi_count(sc->dev);
4401 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4402 		rid = 1;
4403 	} else {
4404 		rid = 0;
4405 		sc->legacy_irq = 1;
4406 	}
4407 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4408 					 1, RF_SHAREABLE | RF_ACTIVE);
4409 	if (sc->irq_res == NULL) {
4410 		device_printf(sc->dev, "could not alloc interrupt\n");
4411 		return ENXIO;
4412 	}
4413 	if (mxge_verbose)
4414 		device_printf(sc->dev, "using %s irq %ld\n",
4415 			      sc->legacy_irq ? "INTx" : "MSI",
4416 			      rman_get_start(sc->irq_res));
4417 	err = bus_setup_intr(sc->dev, sc->irq_res,
4418 			     INTR_MPSAFE,
4419 			     mxge_intr, &sc->ss[0], &sc->ih,
4420 			     sc->ifp->if_serializer);
4421 	if (err != 0) {
4422 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4423 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4424 		if (!sc->legacy_irq)
4425 			pci_release_msi(sc->dev);
4426 	}
4427 	return err;
4428 }
4429 
4430 static void
4431 mxge_rem_msix_irqs(mxge_softc_t *sc)
4432 {
4433 	int i, rid;
4434 
4435 	for (i = 0; i < sc->num_slices; i++) {
4436 		if (sc->msix_ih[i] != NULL) {
4437 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4438 					  sc->msix_ih[i]);
4439 			sc->msix_ih[i] = NULL;
4440 		}
4441 	}
4442 	kfree(sc->msix_ih, M_DEVBUF);
4443 
4444 	for (i = 0; i < sc->num_slices; i++) {
4445 		rid = i + 1;
4446 		if (sc->msix_irq_res[i] != NULL)
4447 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4448 					     sc->msix_irq_res[i]);
4449 		sc->msix_irq_res[i] = NULL;
4450 	}
4451 	kfree(sc->msix_irq_res, M_DEVBUF);
4452 
4453 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4454 			     sc->msix_table_res);
4455 
4456 	pci_release_msi(sc->dev);
4457 	return;
4458 }
4459 
4460 static void
4461 mxge_rem_single_irq(mxge_softc_t *sc)
4462 {
4463 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4464 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4465 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4466 	if (!sc->legacy_irq)
4467 		pci_release_msi(sc->dev);
4468 }
4469 
4470 static void
4471 mxge_rem_irq(mxge_softc_t *sc)
4472 {
4473 	if (sc->num_slices > 1)
4474 		mxge_rem_msix_irqs(sc);
4475 	else
4476 		mxge_rem_single_irq(sc);
4477 }
4478 
4479 static int
4480 mxge_add_irq(mxge_softc_t *sc)
4481 {
4482 	int err;
4483 
4484 	if (sc->num_slices > 1)
4485 		err = mxge_add_msix_irqs(sc);
4486 	else
4487 		err = mxge_add_single_irq(sc);
4488 
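	/*
	 * XXX: the retry below is deliberately disabled by the "0 &&";
	 * it appears to be left in place for exercising MSI-X teardown
	 * and re-setup during debugging.
	 */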
4489 	if (0 && err == 0 && sc->num_slices > 1) {
4490 		mxge_rem_msix_irqs(sc);
4491 		err = mxge_add_msix_irqs(sc);
4492 	}
4493 	return err;
4494 }
4495 
4497 static int
4498 mxge_attach(device_t dev)
4499 {
4500 	mxge_softc_t *sc = device_get_softc(dev);
4501 	struct ifnet *ifp = &sc->arpcom.ac_if;
4502 	int err, rid;
4503 
4504 	/*
4505 	 * avoid rewriting half the lines in this file to use
4506 	 * &sc->arpcom.ac_if instead
4507 	 */
4508 	sc->ifp = ifp;
4509 	sc->dev = dev;
4510 	mxge_fetch_tunables(sc);
4511 
4512 	err = bus_dma_tag_create(NULL,			/* parent */
4513 				 1,			/* alignment */
4514 				 0,			/* boundary */
4515 				 BUS_SPACE_MAXADDR,	/* low */
4516 				 BUS_SPACE_MAXADDR,	/* high */
4517 				 NULL, NULL,		/* filter */
4518 				 65536 + 256,		/* maxsize */
4519 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4520 				 65536,			/* maxsegsize */
4521 				 0,			/* flags */
4522 				 &sc->parent_dmat);	/* tag */
4523 
4524 	if (err != 0) {
4525 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4526 			      err);
4527 		goto abort_with_nothing;
4528 	}
4529 
4531 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4532 
4533 	callout_init_mp(&sc->co_hdl);
4534 
4535 	mxge_setup_cfg_space(sc);
4536 
4537 	/* Map the board into the kernel */
4538 	rid = PCIR_BARS;
4539 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4540 					 ~0, 1, RF_ACTIVE);
4541 	if (sc->mem_res == NULL) {
4542 		device_printf(dev, "could not map memory\n");
4543 		err = ENXIO;
4544 		goto abort_with_nothing;
4545 	}
4546 	sc->sram = rman_get_virtual(sc->mem_res);
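	/*
	 * Usable SRAM: the 2MB BAR less what the arithmetic suggests are
	 * two 48KB regions, one 32KB region, and a 256-byte pad reserved
	 * for firmware use (layout inferred from the constants, not from
	 * documentation).
	 */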
4547 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4548 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4549 		device_printf(dev, "impossible memory region size %ld\n",
4550 			      rman_get_size(sc->mem_res));
4551 		err = ENXIO;
4552 		goto abort_with_mem_res;
4553 	}
4554 
4555 	/* make NULL terminated copy of the EEPROM strings section of
4556 	   lanai SRAM */
4557 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4558 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4559 				rman_get_bushandle(sc->mem_res),
4560 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4561 				sc->eeprom_strings,
4562 				MXGE_EEPROM_STRINGS_SIZE - 2);
4563 	err = mxge_parse_strings(sc);
4564 	if (err != 0)
4565 		goto abort_with_mem_res;
4566 
4567 	/* Enable write combining for efficient use of PCIe bus */
4568 	mxge_enable_wc(sc);
4569 
4570 	/* Allocate the out of band dma memory */
4571 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4572 			     sizeof (mxge_cmd_t), 64);
4573 	if (err != 0)
4574 		goto abort_with_mem_res;
4575 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4576 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4577 	if (err != 0)
4578 		goto abort_with_cmd_dma;
4579 
4580 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4581 	if (err != 0)
4582 		goto abort_with_zeropad_dma;
4583 
4584 	/* select & load the firmware */
4585 	err = mxge_select_firmware(sc);
4586 	if (err != 0)
4587 		goto abort_with_dmabench;
4588 	sc->intr_coal_delay = mxge_intr_coal_delay;
4589 
4590 	mxge_slice_probe(sc);
4591 	err = mxge_alloc_slices(sc);
4592 	if (err != 0)
4593 		goto abort_with_dmabench;
4594 
4595 	err = mxge_reset(sc, 0);
4596 	if (err != 0)
4597 		goto abort_with_slices;
4598 
4599 	err = mxge_alloc_rings(sc);
4600 	if (err != 0) {
4601 		device_printf(sc->dev, "failed to allocate rings\n");
4602 		goto abort_with_dmabench;
4603 	}
4604 
4605 	ifp->if_baudrate = IF_Gbps(10UL);
4606 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4607 		IFCAP_VLAN_MTU;
4608 #ifdef INET
4609 	ifp->if_capabilities |= IFCAP_LRO;
4610 #endif
4611 
4612 #ifdef MXGE_NEW_VLAN_API
4613 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4614 #endif
4615 
4616 	sc->max_mtu = mxge_max_mtu(sc);
4617 	if (sc->max_mtu >= 9000)
4618 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4619 	else
4620 		device_printf(dev, "MTU limited to %d.  Install "
4621 			      "latest firmware for 9000 byte jumbo support\n",
4622 			      sc->max_mtu - ETHER_HDR_LEN);
4623 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4624 	ifp->if_capenable = ifp->if_capabilities;
4625 	if (sc->lro_cnt == 0)
4626 		ifp->if_capenable &= ~IFCAP_LRO;
4627 	sc->csum_flag = 1;
4628 	ifp->if_init = mxge_init;
4629 	ifp->if_softc = sc;
4630 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4631 	ifp->if_ioctl = mxge_ioctl;
4632 	ifp->if_start = mxge_start;
4633 	/* Initialise the ifmedia structure */
4634 	ifmedia_init(&sc->media, 0, mxge_media_change,
4635 		     mxge_media_status);
4636 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4637 	mxge_media_probe(sc);
4638 	sc->dying = 0;
4639 	ether_ifattach(ifp, sc->mac_addr, NULL);
4640 	/* ether_ifattach sets mtu to ETHERMTU */
4641 	if (mxge_initial_mtu != ETHERMTU) {
4642 		lwkt_serialize_enter(ifp->if_serializer);
4643 		mxge_change_mtu(sc, mxge_initial_mtu);
4644 		lwkt_serialize_exit(ifp->if_serializer);
4645 	}
4646 	/* must come after ether_ifattach() */
4647 	err = mxge_add_irq(sc);
4648 	if (err != 0) {
4649 		device_printf(sc->dev, "failed to add irq\n");
4650 		goto abort_with_rings;
4651 	}
4652 
4653 	mxge_add_sysctls(sc);
4654 #ifdef IFNET_BUF_RING
4655 	ifp->if_transmit = mxge_transmit;
4656 	ifp->if_qflush = mxge_qflush;
4657 #endif
4658 	return 0;
4659 
4660 abort_with_rings:
4661 	mxge_free_rings(sc);
4662 abort_with_slices:
4663 	mxge_free_slices(sc);
4664 abort_with_dmabench:
4665 	mxge_dma_free(&sc->dmabench_dma);
4666 abort_with_zeropad_dma:
4667 	mxge_dma_free(&sc->zeropad_dma);
4668 abort_with_cmd_dma:
4669 	mxge_dma_free(&sc->cmd_dma);
4670 abort_with_mem_res:
4671 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4672 	pci_disable_busmaster(dev);
4673 	bus_dma_tag_destroy(sc->parent_dmat);
4674 abort_with_nothing:
4675 	return err;
4676 }
4677 
4678 static int
4679 mxge_detach(device_t dev)
4680 {
4681 	mxge_softc_t *sc = device_get_softc(dev);
4682 
4683 	lwkt_serialize_enter(sc->ifp->if_serializer);
4684 	sc->dying = 1;
4685 	if (sc->ifp->if_flags & IFF_RUNNING)
4686 		mxge_close(sc);
4687 	/*
4688 	 * XXX: race: the callout callback could be spinning on
4689 	 * the serializer and run anyway
4690 	 */
4691 	callout_stop(&sc->co_hdl);
4692 	lwkt_serialize_exit(sc->ifp->if_serializer);
4693 
4694 	ether_ifdetach(sc->ifp);
4695 	ifmedia_removeall(&sc->media);
4696 	mxge_dummy_rdma(sc, 0);
4697 	mxge_rem_sysctls(sc);
4698 	mxge_rem_irq(sc);
4699 	mxge_free_rings(sc);
4700 	mxge_free_slices(sc);
4701 	mxge_dma_free(&sc->dmabench_dma);
4702 	mxge_dma_free(&sc->zeropad_dma);
4703 	mxge_dma_free(&sc->cmd_dma);
4704 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4705 	pci_disable_busmaster(dev);
4706 	bus_dma_tag_destroy(sc->parent_dmat);
4707 	return 0;
4708 }
4709 
4710 static int
4711 mxge_shutdown(device_t dev)
4712 {
4713 	return 0;
4714 }
4715 
4716 /*
4717   This file uses Myri10GE driver indentation.
4718 
4719   Local Variables:
4720   c-file-style:"linux"
4721   tab-width:8
4722   End:
4723 */
4724