xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 650094e1)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $
29 
30 ***************************************************************************/
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/linker.h>
35 #include <sys/firmware.h>
36 #include <sys/endian.h>
37 #include <sys/in_cksum.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/module.h>
43 #include <sys/serialize.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
46 
47 /* count xmits ourselves, rather than via drbr */
48 #define NO_SLOW_STATS
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <sys/bus.h>
68 #include <sys/rman.h>
69 
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
73 
74 #include <vm/vm.h>		/* for pmap_mapdev() */
75 #include <vm/pmap.h>
76 
77 #if defined(__i386) || defined(__x86_64)
78 #include <machine/specialreg.h>
79 #endif
80 
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
85 #ifdef IFNET_BUF_RING
86 #include <sys/buf_ring.h>
87 #endif
88 
89 #include "opt_inet.h"
90 
91 /* tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 /* XXX: not yet */
104 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
105 static int mxge_initial_mtu = ETHERMTU;
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
116 
117 static device_method_t mxge_methods[] =
118 {
119   /* Device interface */
120   DEVMETHOD(device_probe, mxge_probe),
121   DEVMETHOD(device_attach, mxge_attach),
122   DEVMETHOD(device_detach, mxge_detach),
123   DEVMETHOD(device_shutdown, mxge_shutdown),
124   {0, 0}
125 };
126 
127 static driver_t mxge_driver =
128 {
129   "mxge",
130   mxge_methods,
131   sizeof(mxge_softc_t),
132 };
133 
134 static devclass_t mxge_devclass;
135 
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
146 
147 /* XXX: we don't have Large Receive Offload support yet */
148 inline int
149 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
150 {
151 	(void)ss;
152 	(void)m_head;
153 	(void)csum;
154 	return 1;
155 }
156 
157 inline void
158 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
159 {
160 	(void)ss;
161 	(void)lro;
162 }
163 
164 static int
165 mxge_probe(device_t dev)
166 {
167 	int rev;
168 
169 
170 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
171 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
172 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
173 		rev = pci_get_revid(dev);
174 		switch (rev) {
175 		case MXGE_PCI_REV_Z8E:
176 			device_set_desc(dev, "Myri10G-PCIE-8A");
177 			break;
178 		case MXGE_PCI_REV_Z8ES:
179 			device_set_desc(dev, "Myri10G-PCIE-8B");
180 			break;
181 		default:
182 			device_set_desc(dev, "Myri10G-PCIE-8??");
183 			device_printf(dev, "Unrecognized rev %d NIC\n",
184 				      rev);
185 			break;
186 		}
187 		return 0;
188 	}
189 	return ENXIO;
190 }
191 
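/*
 * Write-combining would let the CPU batch the driver's many small PIO
 * copies to NIC SRAM into larger PCIe writes.  It is stubbed out below
 * (sc->wc = 0) pending PAT support.
 */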
192 static void
193 mxge_enable_wc(mxge_softc_t *sc)
194 {
195 #if 0
196 #if defined(__i386) || defined(__x86_64)
197 	vm_offset_t len;
198 	int err;
199 
200 	sc->wc = 1;
201 	len = rman_get_size(sc->mem_res);
202 	err = pmap_change_attr((vm_offset_t) sc->sram,
203 			       len, PAT_WRITE_COMBINING);
204 	if (err != 0) {
205 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
206 			      err);
207 		sc->wc = 0;
208 	}
209 #endif
210 #else
211 	sc->wc = 0;	/* TBD: PAT support */
212 #endif
213 }
214 
215 
216 /* callback to get our DMA address */
217 static void
218 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
219 			 int error)
220 {
221 	if (error == 0) {
222 		*(bus_addr_t *) arg = segs->ds_addr;
223 	}
224 }
225 
226 static int
227 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
228 		   bus_size_t alignment)
229 {
230 	int err;
231 	device_t dev = sc->dev;
232 	bus_size_t boundary, maxsegsize;
233 
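	/*
	 * A 4KB-aligned buffer larger than 4KB necessarily spans 4KB
	 * lines, so the boundary restriction must be dropped for that
	 * case; every other allocation is kept within a single 4KB
	 * span (one segment, 4KB boundary).
	 */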
234 	if (bytes > 4096 && alignment == 4096) {
235 		boundary = 0;
236 		maxsegsize = bytes;
237 	} else {
238 		boundary = 4096;
239 		maxsegsize = 4096;
240 	}
241 
242 	/* allocate DMAable memory tags */
243 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
244 				 alignment,		/* alignment */
245 				 boundary,		/* boundary */
246 				 BUS_SPACE_MAXADDR,	/* low */
247 				 BUS_SPACE_MAXADDR,	/* high */
248 				 NULL, NULL,		/* filter */
249 				 bytes,			/* maxsize */
250 				 1,			/* num segs */
251 				 maxsegsize,		/* maxsegsize */
252 				 BUS_DMA_COHERENT,	/* flags */
253 				 &dma->dmat);		/* tag */
254 	if (err != 0) {
255 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
256 		return err;
257 	}
258 
259 	/* allocate DMAable memory & map */
260 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
261 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
262 				| BUS_DMA_ZERO),  &dma->map);
263 	if (err != 0) {
264 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
265 		goto abort_with_dmat;
266 	}
267 
268 	/* load the memory */
269 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
270 			      mxge_dmamap_callback,
271 			      (void *)&dma->bus_addr, 0);
272 	if (err != 0) {
273 		device_printf(dev, "couldn't load map (err = %d)\n", err);
274 		goto abort_with_mem;
275 	}
276 	return 0;
277 
278 abort_with_mem:
279 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 abort_with_dmat:
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 	return err;
283 }
284 
285 
286 static void
287 mxge_dma_free(mxge_dma_t *dma)
288 {
289 	bus_dmamap_unload(dma->dmat, dma->map);
290 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
291 	(void)bus_dma_tag_destroy(dma->dmat);
292 }
293 
294 /*
295  * The eeprom strings on the lanaiX have the format
296  * SN=x\0
297  * MAC=x:x:x:x:x:x\0
298  * PC=text\0
299  */
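/*
 * An illustrative (made-up) example of the raw bytes:
 *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=M3F2-PCIXE-2\0\0"
 * i.e. a run of NUL-terminated strings ending with an empty string.
 */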
300 
301 static int
302 mxge_parse_strings(mxge_softc_t *sc)
303 {
304 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)	/* skip past one NUL-terminated string */
305 
306 	char *ptr, *limit;
307 	int i, found_mac;
308 
309 	ptr = sc->eeprom_strings;
310 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 	found_mac = 0;
312 	while (ptr < limit && *ptr != '\0') {
313 		if (memcmp(ptr, "MAC=", 4) == 0) {
314 			ptr += 1;
315 			sc->mac_addr_string = ptr;
316 			for (i = 0; i < 6; i++) {
317 				ptr += 3;
318 				if ((ptr + 2) > limit)
319 					goto abort;
320 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
321 				found_mac = 1;
322 			}
323 		} else if (memcmp(ptr, "PC=", 3) == 0) {
324 			ptr += 3;
325 			strncpy(sc->product_code_string, ptr,
326 				sizeof (sc->product_code_string) - 1);
327 		} else if (memcmp(ptr, "SN=", 3) == 0) {
328 			ptr += 3;
329 			strncpy(sc->serial_number_string, ptr,
330 				sizeof (sc->serial_number_string) - 1);
331 		}
332 		MXGE_NEXT_STRING(ptr);
333 	}
334 
335 	if (found_mac)
336 		return 0;
337 
338  abort:
339 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
340 
341 	return ENXIO;
342 }
343 
344 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 static void
346 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
347 {
348 	uint32_t val;
349 	unsigned long base, off;
350 	char *va, *cfgptr;
351 	device_t pdev, mcp55;
352 	uint16_t vendor_id, device_id, word;
353 	uintptr_t bus, slot, func, ivend, idev;
354 	uint32_t *ptr32;
355 
356 
357 	if (!mxge_nvidia_ecrc_enable)
358 		return;
359 
360 	pdev = device_get_parent(device_get_parent(sc->dev));
361 	if (pdev == NULL) {
362 		device_printf(sc->dev, "could not find parent?\n");
363 		return;
364 	}
365 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
366 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 
368 	if (vendor_id != 0x10de)
369 		return;
370 
371 	base = 0;
372 
373 	if (device_id == 0x005d) {
374 		/* ck804, base address is magic */
375 		base = 0xe0000000UL;
376 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
377 		/* mcp55, base address stored in chipset */
378 		mcp55 = pci_find_bsf(0, 0, 0);
379 		if (mcp55 &&
380 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
381 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
382 			word = pci_read_config(mcp55, 0x90, 2);
383 			base = ((unsigned long)word & 0x7ffeU) << 25;
384 		}
385 	}
386 	if (!base)
387 		return;
388 
389 	/* XXXX
390 	   Test below is commented because it is believed that doing
391 	   config read/write beyond 0xff will access the config space
392 	   for the next larger function.  Uncomment this and remove
393 	   the hacky pmap_mapdev() way of accessing config space when
394 	   FreeBSD grows support for extended pcie config space access
395 	*/
396 #if 0
397 	/* See if we can, by some miracle, access the extended
398 	   config space */
399 	val = pci_read_config(pdev, 0x178, 4);
400 	if (val != 0xffffffff) {
401 		val |= 0x40;
402 		pci_write_config(pdev, 0x178, val, 4);
403 		return;
404 	}
405 #endif
406 	/* Rather than using normal pci config space writes, we must
407 	 * map the Nvidia config space ourselves.  This is because on
408 	 * opteron/nvidia class machines the 0xe0000000 mapping is
409 	 * handled by the nvidia chipset, that means the internal PCI
410 	 * device (the on-chip northbridge), or the amd-8131 bridge
411 	 * and things behind them are not visible by this method.
412 	 */
413 
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_BUS, &bus);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_SLOT, &slot);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_FUNCTION, &func);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_VENDOR, &ivend);
422 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
423 		      PCI_IVAR_DEVICE, &idev);
424 
425 	off =  base
426 		+ 0x00100000UL * (unsigned long)bus
427 		+ 0x00001000UL * (unsigned long)(func
428 						 + 8 * slot);
429 
430 	/* map it into the kernel */
431 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
432 
433 
434 	if (va == NULL) {
435 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
436 		return;
437 	}
438 	/* get a pointer to the config space mapped into the kernel */
439 	cfgptr = va + (off & PAGE_MASK);
440 
441 	/* make sure that we can really access it */
442 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
443 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
444 	if (! (vendor_id == ivend && device_id == idev)) {
445 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
446 			      vendor_id, device_id);
447 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
448 		return;
449 	}
450 
451 	ptr32 = (uint32_t*)(cfgptr + 0x178);
452 	val = *ptr32;
453 
454 	if (val == 0xffffffff) {
455 		device_printf(sc->dev, "extended mapping failed\n");
456 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 		return;
458 	}
459 	*ptr32 = val | 0x40;
460 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 	if (mxge_verbose)
462 		device_printf(sc->dev,
463 			      "Enabled ECRC on upstream Nvidia bridge "
464 			      "at %d:%d:%d\n",
465 			      (int)bus, (int)slot, (int)func);
466 	return;
467 }
468 #else
469 static void
470 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 {
472 	device_printf(sc->dev,
473 		      "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
474 	return;
475 }
476 #endif
477 
478 
479 static int
480 mxge_dma_test(mxge_softc_t *sc, int test_type)
481 {
482 	mxge_cmd_t cmd;
483 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
484 	int status;
485 	uint32_t len;
486 	char *test = " ";
487 
488 
489 	/* Run a small DMA test.
490 	 * The magic multipliers to the length tell the firmware
491 	 * to do DMA read, write, or read+write tests.  The
492 	 * results are returned in cmd.data0.  The upper 16
493 	 * bits of the return is the number of transfers completed.
494 	 * The lower 16 bits is the time in 0.5us ticks that the
495 	 * transfers took to complete.
496 	 */
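	/*
	 * In units, the figure computed below is
	 *   MB/s = ((data0 >> 16) * len bytes) / ((data0 & 0xffff) * 0.5 us)
	 *        = (data0 >> 16) * len * 2 / (data0 & 0xffff)
	 * since one byte per microsecond is one MB/s.
	 */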
497 
498 	len = sc->tx_boundary;
499 
500 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
501 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
502 	cmd.data2 = len * 0x10000;
503 	status = mxge_send_cmd(sc, test_type, &cmd);
504 	if (status != 0) {
505 		test = "read";
506 		goto abort;
507 	}
508 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
509 		(cmd.data0 & 0xffff);
510 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
511 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
512 	cmd.data2 = len * 0x1;
513 	status = mxge_send_cmd(sc, test_type, &cmd);
514 	if (status != 0) {
515 		test = "write";
516 		goto abort;
517 	}
518 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
519 		(cmd.data0 & 0xffff);
520 
521 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
522 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
523 	cmd.data2 = len * 0x10001;
524 	status = mxge_send_cmd(sc, test_type, &cmd);
525 	if (status != 0) {
526 		test = "read/write";
527 		goto abort;
528 	}
529 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
530 		(cmd.data0 & 0xffff);
531 
532 abort:
533 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
534 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
535 			      test, status);
536 
537 	return status;
538 }
539 
540 /*
541  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
542  * when the PCI-E Completion packets are aligned on an 8-byte
543  * boundary.  Some PCI-E chip sets always align Completion packets; on
544  * the ones that do not, the alignment can be enforced by enabling
545  * ECRC generation (if supported).
546  *
547  * When PCI-E Completion packets are not aligned, it is actually more
548  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549  *
550  * If the driver can neither enable ECRC nor verify that it has
551  * already been enabled, then it must use a firmware image which works
552  * around unaligned completion packets (ethp_z8e.dat), and it should
553  * also ensure that it never gives the device a Read-DMA which is
554  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
555  * enabled, then the driver should use the aligned (eth_z8e.dat)
556  * firmware image, and set tx_boundary to 4KB.
557  */
558 
559 static int
560 mxge_firmware_probe(mxge_softc_t *sc)
561 {
562 	device_t dev = sc->dev;
563 	int reg, status;
564 	uint16_t pectl;
565 
566 	sc->tx_boundary = 4096;
567 	/*
568 	 * Verify the max read request size was set to 4KB
569 	 * before trying the test with 4KB.
570 	 */
571 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
572 		pectl = pci_read_config(dev, reg + 0x8, 2);
573 		if ((pectl & (5 << 12)) != (5 << 12)) {
574 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
575 				      pectl);
576 			sc->tx_boundary = 2048;
577 		}
578 	}
579 
580 	/*
581 	 * load the optimized firmware (which assumes aligned PCIe
582 	 * completions) in order to see if it works on this host.
583 	 */
584 	sc->fw_name = mxge_fw_aligned;
585 	status = mxge_load_firmware(sc, 1);
586 	if (status != 0) {
587 		return status;
588 	}
589 
590 	/*
591 	 * Enable ECRC if possible
592 	 */
593 	mxge_enable_nvidia_ecrc(sc);
594 
595 	/*
596 	 * Run a DMA test which watches for unaligned completions and
597 	 * aborts on the first one seen.
598 	 */
599 
600 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 	if (status == 0)
602 		return 0; /* keep the aligned firmware */
603 
604 	if (status != E2BIG)
605 		device_printf(dev, "DMA test failed: %d\n", status);
606 	if (status == ENOSYS)
607 		device_printf(dev, "Falling back to ethp! "
608 			      "Please install up to date fw\n");
609 	return status;
610 }
611 
612 static int
613 mxge_select_firmware(mxge_softc_t *sc)
614 {
615 	int aligned = 0;
616 
617 
618 	if (mxge_force_firmware != 0) {
619 		if (mxge_force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
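/*
 * Helper union for casting away const, e.g. when read-only firmware
 * data must be passed to an API that takes a plain char *.
 */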
654 union qualhack
655 {
656         const char *ro_char;
657         char *rw_char;
658 };
659 
660 static int
661 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
662 {
663 
664 
665 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
666 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
667 			      be32toh(hdr->mcp_type));
668 		return EIO;
669 	}
670 
671 	/* save firmware version for sysctl */
672 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 	if (mxge_verbose)
674 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 
676 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
677 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 
679 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
680 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
681 		device_printf(sc->dev, "Found firmware version %s\n",
682 			      sc->fw_version);
683 		device_printf(sc->dev, "Driver needs %d.%d\n",
684 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
685 		return EINVAL;
686 	}
687 	return 0;
688 
689 }
690 
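/*
 * Custom zlib allocators: inflate() calls these (hooked up through
 * zs.zalloc/zs.zfree below) so that its internal state is drawn from
 * the kernel M_TEMP malloc pool.
 */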
691 static void *
692 z_alloc(void *nil, u_int items, u_int size)
693 {
694         void *ptr;
695 
696         ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
697         return ptr;
698 }
699 
700 static void
701 z_free(void *nil, void *ptr)
702 {
703         kfree(ptr, M_TEMP);
704 }
705 
706 
707 static int
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
709 {
710 	z_stream zs;
711 	char *inflate_buffer;
712 	const struct firmware *fw;
713 	const mcp_gen_header_t *hdr;
714 	unsigned hdr_offset;
715 	int status;
716 	unsigned int i;
717 	char dummy;
718 	size_t fw_len;
719 
720 	fw = firmware_get(sc->fw_name);
721 	if (fw == NULL) {
722 		device_printf(sc->dev, "Could not find firmware image %s\n",
723 			      sc->fw_name);
724 		return ENOENT;
725 	}
726 
727 
728 
729 	/* setup zlib and decompress f/w */
730 	bzero(&zs, sizeof (zs));
731 	zs.zalloc = z_alloc;
732 	zs.zfree = z_free;
733 	status = inflateInit(&zs);
734 	if (status != Z_OK) {
735 		status = EIO;
736 		goto abort_with_fw;
737 	}
738 
739 	/* the uncompressed size is stored as the firmware version,
740 	   which would otherwise go unused */
741 	fw_len = (size_t) fw->version;
742 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
743 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* don't fall through with Z_OK (0) and report success */
744 		goto abort_with_zs;
	}
745 	zs.avail_in = fw->datasize;
746 	zs.next_in = __DECONST(char *, fw->data);
747 	zs.avail_out = fw_len;
748 	zs.next_out = inflate_buffer;
749 	status = inflate(&zs, Z_FINISH);
750 	if (status != Z_STREAM_END) {
751 		device_printf(sc->dev, "zlib %d\n", status);
752 		status = EIO;
753 		goto abort_with_buffer;
754 	}
755 
756 	/* check id */
757 	hdr_offset = be32toh(*(const uint32_t *)
758 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
759 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
760 		device_printf(sc->dev, "Bad firmware file");
761 		status = EIO;
762 		goto abort_with_buffer;
763 	}
764 	hdr = (const void*)(inflate_buffer + hdr_offset);
765 
766 	status = mxge_validate_firmware(sc, hdr);
767 	if (status != 0)
768 		goto abort_with_buffer;
769 
770 	/* Copy the inflated firmware to NIC SRAM. */
771 	for (i = 0; i < fw_len; i += 256) {
772 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
773 			      inflate_buffer + i,
774 			      min(256U, (unsigned)(fw_len - i)));
775 		wmb();
776 		dummy = *sc->sram;
777 		wmb();
778 	}
779 
780 	*limit = fw_len;
781 	status = 0;
782 abort_with_buffer:
783 	kfree(inflate_buffer, M_TEMP);
784 abort_with_zs:
785 	inflateEnd(&zs);
786 abort_with_fw:
787 	firmware_put(fw, FIRMWARE_UNLOAD);
788 	return status;
789 }
790 
791 /*
792  * Enable or disable periodic RDMAs from the host to make certain
793  * chipsets resend dropped PCIe messages
794  */
795 
796 static void
797 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
798 {
799 	char buf_bytes[72];
800 	volatile uint32_t *confirm;
801 	volatile char *submit;
802 	uint32_t *buf, dma_low, dma_high;
803 	int i;
804 
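	/* carve an 8-byte-aligned command block out of buf_bytes */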
805 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 
807 	/* clear confirmation addr */
808 	confirm = (volatile uint32_t *)sc->cmd;
809 	*confirm = 0;
810 	wmb();
811 
812 	/* send an rdma command to the PCIe engine, and wait for the
813 	   response in the confirmation address.  The firmware should
814 	   write a -1 there to indicate it is alive and well
815 	*/
816 
817 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
818 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
819 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
820 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
821 	buf[2] = htobe32(0xffffffff);		/* confirm data */
822 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
823 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
824 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
825 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
826 	buf[5] = htobe32(enable);			/* enable? */
827 
828 
829 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 
831 	mxge_pio_copy(submit, buf, 64);
832 	wmb();
833 	DELAY(1000);
834 	wmb();
835 	i = 0;
836 	while (*confirm != 0xffffffff && i < 20) {
837 		DELAY(1000);
838 		i++;
839 	}
840 	if (*confirm != 0xffffffff) {
841 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
842 			      (enable ? "enable" : "disable"), confirm,
843 			      *confirm);
844 	}
845 	return;
846 }
847 
848 static int
849 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
850 {
851 	mcp_cmd_t *buf;
852 	char buf_bytes[sizeof(*buf) + 8];
853 	volatile mcp_cmd_response_t *response = sc->cmd;
854 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
855 	uint32_t dma_low, dma_high;
856 	int err, sleep_total = 0;
857 
858 	/*
859 	 * We may be called during attach, before if_serializer is available.
860 	 * This is not a fast path, just check for NULL
861 	 */
862 
863 	if (sc->ifp->if_serializer)
864 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
865 
866 	/* ensure buf is aligned to 8 bytes */
867 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
868 
869 	buf->data0 = htobe32(data->data0);
870 	buf->data1 = htobe32(data->data1);
871 	buf->data2 = htobe32(data->data2);
872 	buf->cmd = htobe32(cmd);
873 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
874 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
875 
876 	buf->response_addr.low = htobe32(dma_low);
877 	buf->response_addr.high = htobe32(dma_high);
878 
879 
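	/*
	 * Handshake: the host PIOs the command block into NIC SRAM,
	 * and the firmware DMAs an mcp_cmd_response_t back to
	 * response_addr (sc->cmd in host memory), which is polled
	 * below.
	 */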
880 	response->result = 0xffffffff;
881 	wmb();
882 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
883 
884 	/* wait up to 20ms */
885 	err = EAGAIN;
886 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
887 		bus_dmamap_sync(sc->cmd_dma.dmat,
888 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
889 		wmb();
890 		switch (be32toh(response->result)) {
891 		case 0:
892 			data->data0 = be32toh(response->data);
893 			err = 0;
894 			break;
895 		case 0xffffffff:
896 			DELAY(1000);
897 			break;
898 		case MXGEFW_CMD_UNKNOWN:
899 			err = ENOSYS;
900 			break;
901 		case MXGEFW_CMD_ERROR_UNALIGNED:
902 			err = E2BIG;
903 			break;
904 		case MXGEFW_CMD_ERROR_BUSY:
905 			err = EBUSY;
906 			break;
907 		default:
908 			device_printf(sc->dev,
909 				      "mxge: command %d "
910 				      "failed, result = %d\n",
911 				      cmd, be32toh(response->result));
912 			err = ENXIO;
913 			break;
914 		}
915 		if (err != EAGAIN)
916 			break;
917 	}
918 	if (err == EAGAIN)
919 		device_printf(sc->dev, "mxge: command %d timed out"
920 			      "result = %d\n",
921 			      cmd, be32toh(response->result));
922 	return err;
923 }
924 
925 static int
926 mxge_adopt_running_firmware(mxge_softc_t *sc)
927 {
928 	struct mcp_gen_header *hdr;
929 	const size_t bytes = sizeof (struct mcp_gen_header);
930 	size_t hdr_offset;
931 	int status;
932 
933 	/* find running firmware header */
934 	hdr_offset = be32toh(*(volatile uint32_t *)
935 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
936 
937 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
938 		device_printf(sc->dev,
939 			      "Running firmware has bad header offset (%d)\n",
940 			      (int)hdr_offset);
941 		return EIO;
942 	}
943 
944 	/* copy header of running firmware from SRAM to host memory to
945 	 * validate firmware */
946 	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
947 	if (hdr == NULL) {
948 		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
949 		return ENOMEM;
950 	}
951 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
952 				rman_get_bushandle(sc->mem_res),
953 				hdr_offset, (char *)hdr, bytes);
954 	status = mxge_validate_firmware(sc, hdr);
955 	kfree(hdr, M_DEVBUF);
956 
957 	/*
958 	 * check to see if adopted firmware has bug where adopting
959 	 * it will cause broadcasts to be filtered unless the NIC
960 	 * is kept in ALLMULTI mode
961 	 */
962 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
963 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
964 		sc->adopted_rx_filter_bug = 1;
965 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
966 			      "working around rx filter bug\n",
967 			      sc->fw_ver_major, sc->fw_ver_minor,
968 			      sc->fw_ver_tiny);
969 	}
970 
971 	return status;
972 }
973 
974 
975 static int
976 mxge_load_firmware(mxge_softc_t *sc, int adopt)
977 {
978 	volatile uint32_t *confirm;
979 	volatile char *submit;
980 	char buf_bytes[72];
981 	uint32_t *buf, size, dma_low, dma_high;
982 	int status, i;
983 
984 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
985 
986 	size = sc->sram_size;
987 	status = mxge_load_firmware_helper(sc, &size);
988 	if (status) {
989 		if (!adopt)
990 			return status;
991 		/* Try to use the currently running firmware, if
992 		   it is new enough */
993 		status = mxge_adopt_running_firmware(sc);
994 		if (status) {
995 			device_printf(sc->dev,
996 				      "failed to adopt running firmware\n");
997 			return status;
998 		}
999 		device_printf(sc->dev,
1000 			      "Successfully adopted running firmware\n");
1001 		if (sc->tx_boundary == 4096) {
1002 			device_printf(sc->dev,
1003 				"Using firmware currently running on NIC.  "
1004 				"For optimal performance, consider loading "
1005 				"an optimized firmware image\n");
1008 		}
1009 		sc->fw_name = mxge_fw_unaligned;
1010 		sc->tx_boundary = 2048;
1011 		return 0;
1012 	}
1013 	/* clear confirmation addr */
1014 	confirm = (volatile uint32_t *)sc->cmd;
1015 	*confirm = 0;
1016 	wmb();
1017 	/* send a reload command to the bootstrap MCP, and wait for the
1018 	   response in the confirmation address.  The firmware should
1019 	   write a -1 there to indicate it is alive and well
1020 	*/
1021 
1022 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1023 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1024 
1025 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1026 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1027 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1028 
1029 	/* FIX: All newest firmware should un-protect the bottom of
1030 	   the sram before handoff. However, the very first interfaces
1031 	   do not. Therefore the handoff copy must skip the first 8 bytes
1032 	*/
1033 					/* where the code starts*/
1034 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1035 	buf[4] = htobe32(size - 8); 	/* length of code */
1036 	buf[5] = htobe32(8);		/* where to copy to */
1037 	buf[6] = htobe32(0);		/* where to jump to */
1038 
1039 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1040 	mxge_pio_copy(submit, buf, 64);
1041 	wmb();
1042 	DELAY(1000);
1043 	wmb();
1044 	i = 0;
1045 	while (*confirm != 0xffffffff && i < 20) {
1046 		DELAY(1000*10);
1047 		i++;
1048 		bus_dmamap_sync(sc->cmd_dma.dmat,
1049 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1050 	}
1051 	if (*confirm != 0xffffffff) {
1052 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1053 			confirm, *confirm);
1054 
1055 		return ENXIO;
1056 	}
1057 	return 0;
1058 }
1059 
1060 static int
1061 mxge_update_mac_address(mxge_softc_t *sc)
1062 {
1063 	mxge_cmd_t cmd;
1064 	uint8_t *addr = sc->mac_addr;
1065 	int status;
1066 
1067 
1068 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1069 		     | (addr[2] << 8) | addr[3]);
1070 
1071 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1072 
1073 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1074 	return status;
1075 }
1076 
1077 static int
1078 mxge_change_pause(mxge_softc_t *sc, int pause)
1079 {
1080 	mxge_cmd_t cmd;
1081 	int status;
1082 
1083 	if (pause)
1084 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1085 				       &cmd);
1086 	else
1087 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1088 				       &cmd);
1089 
1090 	if (status) {
1091 		device_printf(sc->dev, "Failed to set flow control mode\n");
1092 		return ENXIO;
1093 	}
1094 	sc->pause = pause;
1095 	return 0;
1096 }
1097 
1098 static void
1099 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1100 {
1101 	mxge_cmd_t cmd;
1102 	int status;
1103 
1104 	if (sc->ifp->if_serializer)
1105 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
1106 	if (mxge_always_promisc)
1107 		promisc = 1;
1108 
1109 	if (promisc)
1110 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1111 				       &cmd);
1112 	else
1113 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1114 				       &cmd);
1115 
1116 	if (status) {
1117 		device_printf(sc->dev, "Failed to set promisc mode\n");
1118 	}
1119 }
1120 
1121 static void
1122 mxge_set_multicast_list(mxge_softc_t *sc)
1123 {
1124 	mxge_cmd_t cmd;
1125 	struct ifmultiaddr *ifma;
1126 	struct ifnet *ifp = sc->ifp;
1127 	int err;
1128 
1129 	if (ifp->if_serializer)
1130 		ASSERT_SERIALIZED(ifp->if_serializer);
1131 
1132 	/* This firmware is known to not support multicast */
1133 	if (!sc->fw_multicast_support)
1134 		return;
1135 
1136 	/* Disable multicast filtering while we play with the lists*/
1137 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1140 		       " error status: %d\n", err);
1141 		return;
1142 	}
1143 
1144 	if (sc->adopted_rx_filter_bug)
1145 		return;
1146 
1147 	if (ifp->if_flags & IFF_ALLMULTI)
1148 		/* request to disable multicast filtering, so quit here */
1149 		return;
1150 
1151 	/* Flush all the filters */
1152 
1153 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1154 	if (err != 0) {
1155 		device_printf(sc->dev,
1156 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1157 			      ", error status: %d\n", err);
1158 		return;
1159 	}
1160 
1161 	/* Walk the multicast list, and add each address */
1162 
1163 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1164 		if (ifma->ifma_addr->sa_family != AF_LINK)
1165 			continue;
1166 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1167 		      &cmd.data0, 4);
1168 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1169 		      &cmd.data1, 2);
1170 		cmd.data0 = htonl(cmd.data0);
1171 		cmd.data1 = htonl(cmd.data1);
1172 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1173 		if (err != 0) {
1174 			device_printf(sc->dev, "Failed "
1175 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1176 			       "%d\t", err);
1177 			/* abort, leaving multicast filtering off */
1178 			return;
1179 		}
1180 	}
1181 	/* Enable multicast filtering */
1182 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1183 	if (err != 0) {
1184 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1185 		       ", error status: %d\n", err);
1186 	}
1187 }
1188 
1189 static int
1190 mxge_max_mtu(mxge_softc_t *sc)
1191 {
1192 	mxge_cmd_t cmd;
1193 	int status;
1194 
1195 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1196 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1197 
1198 	/* try to set nbufs to see if we can
1199 	   use virtually contiguous jumbos */
1200 	cmd.data0 = 0;
1201 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1202 			       &cmd);
1203 	if (status == 0)
1204 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1205 
1206 	/* otherwise, we're limited to MJUMPAGESIZE */
1207 	return MJUMPAGESIZE - MXGEFW_PAD;
1208 }
1209 
1210 static int
1211 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1212 {
1213 	struct mxge_slice_state *ss;
1214 	mxge_rx_done_t *rx_done;
1215 	volatile uint32_t *irq_claim;
1216 	mxge_cmd_t cmd;
1217 	int slice, status;
1218 
1219 	/* try to send a reset command to the card to see if it
1220 	   is alive */
1221 	memset(&cmd, 0, sizeof (cmd));
1222 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1223 	if (status != 0) {
1224 		device_printf(sc->dev, "failed reset\n");
1225 		return ENXIO;
1226 	}
1227 
1228 	mxge_dummy_rdma(sc, 1);
1229 
1230 
1231 	/* set the intrq size */
1232 	cmd.data0 = sc->rx_ring_size;
1233 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1234 
1235 	/*
1236 	 * Even though we already know how many slices are supported
1237 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1238 	 * has magic side effects, and must be called after a reset.
1239 	 * It must be called prior to calling any RSS related cmds,
1240 	 * including assigning an interrupt queue for anything but
1241 	 * slice 0.  It must also be called *after*
1242 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1243 	 * the firmware to compute offsets.
1244 	 */
1245 
1246 	if (sc->num_slices > 1) {
1247 		/* ask the maximum number of slices it supports */
1248 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1249 					   &cmd);
1250 		if (status != 0) {
1251 			device_printf(sc->dev,
1252 				      "failed to get number of slices\n");
1253 			return status;
1254 		}
1255 		/*
1256 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1257 		 * to setting up the interrupt queue DMA
1258 		 */
1259 		cmd.data0 = sc->num_slices;
1260 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1261 #ifdef IFNET_BUF_RING
1262 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1263 #endif
1264 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1265 					   &cmd);
1266 		if (status != 0) {
1267 			device_printf(sc->dev,
1268 				      "failed to set number of slices\n");
1269 			return status;
1270 		}
1271 	}
1272 
1273 
1274 	if (interrupts_setup) {
1275 		/* Now exchange information about interrupts  */
1276 		for (slice = 0; slice < sc->num_slices; slice++) {
1277 			rx_done = &sc->ss[slice].rx_done;
1278 			memset(rx_done->entry, 0, sc->rx_ring_size);
1279 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1280 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1281 			cmd.data2 = slice;
1282 			status |= mxge_send_cmd(sc,
1283 						MXGEFW_CMD_SET_INTRQ_DMA,
1284 						&cmd);
1285 		}
1286 	}
1287 
1288 	status |= mxge_send_cmd(sc,
1289 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1290 
1291 
1292 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1293 
1294 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1295 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1296 
1297 
1298 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1299 				&cmd);
1300 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1301 	if (status != 0) {
1302 		device_printf(sc->dev, "failed set interrupt parameters\n");
1303 		return status;
1304 	}
1305 
1306 
1307 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1308 
1309 
1310 	/* run a DMA benchmark */
1311 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1312 
1313 	for (slice = 0; slice < sc->num_slices; slice++) {
1314 		ss = &sc->ss[slice];
1315 
1316 		ss->irq_claim = irq_claim + (2 * slice);
1317 		/* reset mcp/driver shared state back to 0 */
1318 		ss->rx_done.idx = 0;
1319 		ss->rx_done.cnt = 0;
1320 		ss->tx.req = 0;
1321 		ss->tx.done = 0;
1322 		ss->tx.pkt_done = 0;
1323 		ss->tx.queue_active = 0;
1324 		ss->tx.activate = 0;
1325 		ss->tx.deactivate = 0;
1326 		ss->tx.wake = 0;
1327 		ss->tx.defrag = 0;
1328 		ss->tx.stall = 0;
1329 		ss->rx_big.cnt = 0;
1330 		ss->rx_small.cnt = 0;
1331 		ss->lro_bad_csum = 0;
1332 		ss->lro_queued = 0;
1333 		ss->lro_flushed = 0;
1334 		if (ss->fw_stats != NULL) {
1335 			ss->fw_stats->valid = 0;
1336 			ss->fw_stats->send_done_count = 0;
1337 		}
1338 	}
1339 	sc->rdma_tags_available = 15;
1340 	status = mxge_update_mac_address(sc);
1341 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1342 	mxge_change_pause(sc, sc->pause);
1343 	mxge_set_multicast_list(sc);
1344 	return status;
1345 }
1346 
1347 static int
1348 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1349 {
1350         mxge_softc_t *sc;
1351         unsigned int intr_coal_delay;
1352         int err;
1353 
1354         sc = arg1;
1355         intr_coal_delay = sc->intr_coal_delay;
1356         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1357         if (err != 0) {
1358                 return err;
1359         }
1360         if (intr_coal_delay == sc->intr_coal_delay)
1361                 return 0;
1362 
1363         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1364                 return EINVAL;
1365 
1366 	lwkt_serialize_enter(sc->ifp->if_serializer);
1367 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1368 	sc->intr_coal_delay = intr_coal_delay;
1369 
1370 	lwkt_serialize_exit(sc->ifp->if_serializer);
1371         return err;
1372 }
1373 
1374 static int
1375 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1376 {
1377         mxge_softc_t *sc;
1378         unsigned int enabled;
1379         int err;
1380 
1381         sc = arg1;
1382         enabled = sc->pause;
1383         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1384         if (err != 0) {
1385                 return err;
1386         }
1387         if (enabled == sc->pause)
1388                 return 0;
1389 
1390 	lwkt_serialize_enter(sc->ifp->if_serializer);
1391 	err = mxge_change_pause(sc, enabled);
1392 	lwkt_serialize_exit(sc->ifp->if_serializer);
1393         return err;
1394 }
1395 
1396 static int
1397 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1398 {
1399 	struct ifnet *ifp;
1400 	int err = 0;
1401 
1402 	ifp = sc->ifp;
1403 	if (lro_cnt == 0)
1404 		ifp->if_capenable &= ~IFCAP_LRO;
1405 	else
1406 		ifp->if_capenable |= IFCAP_LRO;
1407 	sc->lro_cnt = lro_cnt;
1408 	if (ifp->if_flags & IFF_RUNNING) {
1409 		mxge_close(sc);
1410 		err = mxge_open(sc);
1411 	}
1412 	return err;
1413 }
1414 
1415 static int
1416 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1417 {
1418 	mxge_softc_t *sc;
1419 	unsigned int lro_cnt;
1420 	int err;
1421 
1422 	sc = arg1;
1423 	lro_cnt = sc->lro_cnt;
1424 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1425 	if (err != 0)
1426 		return err;
1427 
1428 	if (lro_cnt == sc->lro_cnt)
1429 		return 0;
1430 
1431 	if (lro_cnt > 128)
1432 		return EINVAL;
1433 
1434 	lwkt_serialize_enter(sc->ifp->if_serializer);
1435 	err = mxge_change_lro_locked(sc, lro_cnt);
1436 	lwkt_serialize_exit(sc->ifp->if_serializer);
1437 	return err;
1438 }
1439 
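/*
 * Sysctl handler for the big-endian counters in the firmware stats
 * block: byte-swap the value into arg2 and pass a NULL arg1 so that
 * sysctl_handle_int() exports it read-only.
 */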
1440 static int
1441 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1442 {
1443         int err;
1444 
1445         if (arg1 == NULL)
1446                 return EFAULT;
1447         arg2 = be32toh(*(int *)arg1);
1448         arg1 = NULL;
1449         err = sysctl_handle_int(oidp, arg1, arg2, req);
1450 
1451         return err;
1452 }
1453 
1454 static void
1455 mxge_rem_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct mxge_slice_state *ss;
1458 	int slice;
1459 
1460 	if (sc->slice_sysctl_tree == NULL)
1461 		return;
1462 
1463 	for (slice = 0; slice < sc->num_slices; slice++) {
1464 		ss = &sc->ss[slice];
1465 		if (ss == NULL || ss->sysctl_tree == NULL)
1466 			continue;
1467 		sysctl_ctx_free(&ss->sysctl_ctx);
1468 		ss->sysctl_tree = NULL;
1469 	}
1470 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1471 	sc->slice_sysctl_tree = NULL;
1472 	sysctl_ctx_free(&sc->sysctl_ctx);
1473 	sc->sysctl_tree = NULL;
1474 
1475 }
1476 
1477 static void
1478 mxge_add_sysctls(mxge_softc_t *sc)
1479 {
1480 	struct sysctl_ctx_list *ctx;
1481 	struct sysctl_oid_list *children;
1482 	mcp_irq_data_t *fw;
1483 	struct mxge_slice_state *ss;
1484 	int slice;
1485 	char slice_num[8];
1486 
1487 	ctx = &sc->sysctl_ctx;
1488 	sysctl_ctx_init(ctx);
1489 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1490 					  OID_AUTO,
1491 					  device_get_nameunit(sc->dev),
1492 					  CTLFLAG_RD, 0, "");
1493 	if (sc->sysctl_tree == NULL) {
1494 		device_printf(sc->dev, "can't add sysctl node\n");
1495 		return;
1496 	}
1497 
1498 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1499 	fw = sc->ss[0].fw_stats;
1500 
1501 	/* random information */
1502 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1503 		       "firmware_version",
1504 		       CTLFLAG_RD, &sc->fw_version,
1505 		       0, "firmware version");
1506 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1507 		       "serial_number",
1508 		       CTLFLAG_RD, &sc->serial_number_string,
1509 		       0, "serial number");
1510 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1511 		       "product_code",
1512 		       CTLFLAG_RD, &sc->product_code_string,
1513 		       0, "product_code");
1514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 		       "pcie_link_width",
1516 		       CTLFLAG_RD, &sc->link_width,
1517 		       0, "tx_boundary");
1518 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1519 		       "tx_boundary",
1520 		       CTLFLAG_RD, &sc->tx_boundary,
1521 		       0, "tx_boundary");
1522 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1523 		       "write_combine",
1524 		       CTLFLAG_RD, &sc->wc,
1525 		       0, "write combining PIO?");
1526 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 		       "read_dma_MBs",
1528 		       CTLFLAG_RD, &sc->read_dma,
1529 		       0, "DMA Read speed in MB/s");
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "write_dma_MBs",
1532 		       CTLFLAG_RD, &sc->write_dma,
1533 		       0, "DMA Write speed in MB/s");
1534 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1535 		       "read_write_dma_MBs",
1536 		       CTLFLAG_RD, &sc->read_write_dma,
1537 		       0, "DMA concurrent Read/Write speed in MB/s");
1538 
1539 
1540 	/* performance related tunables */
1541 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 			"intr_coal_delay",
1543 			CTLTYPE_INT|CTLFLAG_RW, sc,
1544 			0, mxge_change_intr_coal,
1545 			"I", "interrupt coalescing delay in usecs");
1546 
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"flow_control_enabled",
1549 			CTLTYPE_INT|CTLFLAG_RW, sc,
1550 			0, mxge_change_flow_control,
1551 			"I", "interrupt coalescing delay in usecs");
1552 
1553 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1554 		       "deassert_wait",
1555 		       CTLFLAG_RW, &mxge_deassert_wait,
1556 		       0, "Wait for IRQ line to go low in ihandler");
1557 
1558 	/* stats block from firmware is in network byte order.
1559 	   Need to swap it */
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 			"link_up",
1562 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1563 			0, mxge_handle_be32,
1564 			"I", "link up");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"rdma_tags_available",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1568 			0, mxge_handle_be32,
1569 			"I", "rdma_tags_available");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_bad_crc32",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_bad_crc32,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_bad_crc32");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_bad_phy",
1578 			CTLTYPE_INT|CTLFLAG_RD,
1579 			&fw->dropped_bad_phy,
1580 			0, mxge_handle_be32,
1581 			"I", "dropped_bad_phy");
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_link_error_or_filtered",
1584 			CTLTYPE_INT|CTLFLAG_RD,
1585 			&fw->dropped_link_error_or_filtered,
1586 			0, mxge_handle_be32,
1587 			"I", "dropped_link_error_or_filtered");
1588 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 			"dropped_link_overflow",
1590 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1591 			0, mxge_handle_be32,
1592 			"I", "dropped_link_overflow");
1593 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 			"dropped_multicast_filtered",
1595 			CTLTYPE_INT|CTLFLAG_RD,
1596 			&fw->dropped_multicast_filtered,
1597 			0, mxge_handle_be32,
1598 			"I", "dropped_multicast_filtered");
1599 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600 			"dropped_no_big_buffer",
1601 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1602 			0, mxge_handle_be32,
1603 			"I", "dropped_no_big_buffer");
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_no_small_buffer",
1606 			CTLTYPE_INT|CTLFLAG_RD,
1607 			&fw->dropped_no_small_buffer,
1608 			0, mxge_handle_be32,
1609 			"I", "dropped_no_small_buffer");
1610 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1611 			"dropped_overrun",
1612 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1613 			0, mxge_handle_be32,
1614 			"I", "dropped_overrun");
1615 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1616 			"dropped_pause",
1617 			CTLTYPE_INT|CTLFLAG_RD,
1618 			&fw->dropped_pause,
1619 			0, mxge_handle_be32,
1620 			"I", "dropped_pause");
1621 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1622 			"dropped_runt",
1623 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1624 			0, mxge_handle_be32,
1625 			"I", "dropped_runt");
1626 
1627 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1628 			"dropped_unicast_filtered",
1629 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1630 			0, mxge_handle_be32,
1631 			"I", "dropped_unicast_filtered");
1632 
1633 	/* verbose printing? */
1634 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 		       "verbose",
1636 		       CTLFLAG_RW, &mxge_verbose,
1637 		       0, "verbose printing");
1638 
1639 	/* lro */
1640 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641 			"lro_cnt",
1642 			CTLTYPE_INT|CTLFLAG_RW, sc,
1643 			0, mxge_change_lro,
1644 			"I", "number of lro merge queues");
1645 
1646 
1647 	/* add counters exported for debugging from all slices */
1648 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1649 	sc->slice_sysctl_tree =
1650 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1651 				"slice", CTLFLAG_RD, 0, "");
1652 
1653 	for (slice = 0; slice < sc->num_slices; slice++) {
1654 		ss = &sc->ss[slice];
1655 		sysctl_ctx_init(&ss->sysctl_ctx);
1656 		ctx = &ss->sysctl_ctx;
1657 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1658 		ksprintf(slice_num, "%d", slice);
1659 		ss->sysctl_tree =
1660 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1661 					CTLFLAG_RD, 0, "");
1662 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "rx_small_cnt",
1665 			       CTLFLAG_RD, &ss->rx_small.cnt,
1666 			       0, "rx_small_cnt");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "rx_big_cnt",
1669 			       CTLFLAG_RD, &ss->rx_big.cnt,
1670 			       0, "rx_small_cnt");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1673 			       0, "number of lro merge queues flushed");
1674 
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1677 			       0, "number of frames appended to lro merge"
1678 			       "queues");
1679 
1680 #ifndef IFNET_BUF_RING
1681 		/* only transmit from slice 0 for now */
1682 		if (slice > 0)
1683 			continue;
1684 #endif
1685 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686 			       "tx_req",
1687 			       CTLFLAG_RD, &ss->tx.req,
1688 			       0, "tx_req");
1689 
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "tx_done",
1692 			       CTLFLAG_RD, &ss->tx.done,
1693 			       0, "tx_done");
1694 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1695 			       "tx_pkt_done",
1696 			       CTLFLAG_RD, &ss->tx.pkt_done,
1697 			       0, "tx_done");
1698 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699 			       "tx_stall",
1700 			       CTLFLAG_RD, &ss->tx.stall,
1701 			       0, "tx_stall");
1702 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1703 			       "tx_wake",
1704 			       CTLFLAG_RD, &ss->tx.wake,
1705 			       0, "tx_wake");
1706 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1707 			       "tx_defrag",
1708 			       CTLFLAG_RD, &ss->tx.defrag,
1709 			       0, "tx_defrag");
1710 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1711 			       "tx_queue_active",
1712 			       CTLFLAG_RD, &ss->tx.queue_active,
1713 			       0, "tx_queue_active");
1714 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1715 			       "tx_activate",
1716 			       CTLFLAG_RD, &ss->tx.activate,
1717 			       0, "tx_activate");
1718 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1719 			       "tx_deactivate",
1720 			       CTLFLAG_RD, &ss->tx.deactivate,
1721 			       0, "tx_deactivate");
1722 	}
1723 }
1724 
1725 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1726    backwards one at a time and handle ring wraps */
1727 
1728 static inline void
1729 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1730 			    mcp_kreq_ether_send_t *src, int cnt)
1731 {
1732         int idx, starting_slot;
1733         starting_slot = tx->req;
1734         while (cnt > 1) {
1735                 cnt--;
1736                 idx = (starting_slot + cnt) & tx->mask;
1737                 mxge_pio_copy(&tx->lanai[idx],
1738 			      &src[cnt], sizeof(*src));
1739                 wmb();
1740         }
1741 }
1742 
1743 /*
1744  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1745  * at most 32 bytes at a time, so as to avoid involving the software
1746  * pio handler in the nic.   We re-write the first segment's flags
1747  * to mark them valid only after writing the entire chain
1748  */
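/*
 * Each mcp_kreq_ether_send_t is 16 bytes, so the two-at-a-time copy in
 * the loop below emits exactly one 32-byte chunk per wmb().
 */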
1749 
1750 static inline void
1751 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1752                   int cnt)
1753 {
1754         int idx, i;
1755         uint32_t *src_ints;
1756 	volatile uint32_t *dst_ints;
1757         mcp_kreq_ether_send_t *srcp;
1758 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1759 	uint8_t last_flags;
1760 
1761         idx = tx->req & tx->mask;
1762 
1763 	last_flags = src->flags;
1764 	src->flags = 0;
1765         wmb();
1766         dst = dstp = &tx->lanai[idx];
1767         srcp = src;
1768 
1769         if ((idx + cnt) < tx->mask) {
1770                 for (i = 0; i < (cnt - 1); i += 2) {
1771                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1772                         wmb(); /* force write every 32 bytes */
1773                         srcp += 2;
1774                         dstp += 2;
1775                 }
1776         } else {
1777                 /* submit all but the first request, and ensure
1778                    that it is submitted below */
1779                 mxge_submit_req_backwards(tx, src, cnt);
1780                 i = 0;
1781         }
1782         if (i < cnt) {
1783                 /* submit the first request */
1784                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1785                 wmb(); /* barrier before setting valid flag */
1786         }
1787 
1788         /* re-write the last 32-bits with the valid flags */
1789         src->flags = last_flags;
1790         src_ints = (uint32_t *)src;
1791         src_ints+=3;
1792         dst_ints = (volatile uint32_t *)dst;
1793         dst_ints+=3;
1794         *dst_ints =  *src_ints;
1795         tx->req += cnt;
1796         wmb();
1797 }
1798 
1799 #if IFCAP_TSO4
1800 
1801 static void
1802 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1803 	       int busdma_seg_cnt, int ip_off)
1804 {
1805 	mxge_tx_ring_t *tx;
1806 	mcp_kreq_ether_send_t *req;
1807 	bus_dma_segment_t *seg;
1808 	struct ip *ip;
1809 	struct tcphdr *tcp;
1810 	uint32_t low, high_swapped;
1811 	int len, seglen, cum_len, cum_len_next;
1812 	int next_is_first, chop, cnt, rdma_count, small;
1813 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1814 	uint8_t flags, flags_next;
1815 	static int once;
1816 
1817 	mss = m->m_pkthdr.tso_segsz;
1818 
1819 	/* negative cum_len signifies to the
1820 	 * send loop that we are still in the
1821 	 * header portion of the TSO packet.
1822 	 */
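	/*
	 * Illustrative example: with a 14-byte Ethernet header and
	 * 20-byte IP and TCP headers, cum_len starts at -54 and
	 * crosses zero exactly at the first byte of TCP payload.
	 */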
1823 
1824 	/* ensure we have the ethernet, IP and TCP
1825 	   header together in the first mbuf, copy
1826 	   it to a scratch buffer if not */
1827 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1828 		m_copydata(m, 0, ip_off + sizeof (*ip),
1829 			   ss->scratch);
1830 		ip = (struct ip *)(ss->scratch + ip_off);
1831 	} else {
1832 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1833 	}
1834 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1835 			    + sizeof (*tcp))) {
1836 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1837 			   + sizeof (*tcp),  ss->scratch);
1838 		ip = (struct ip *)(ss->scratch + ip_off);	/* headers were just copied into the scratch buffer */
1839 	}
1840 
1841 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1842 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1843 
1844 	/* TSO implies checksum offload on this hardware */
1845 	cksum_offset = ip_off + (ip->ip_hl << 2);
1846 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1847 
1848 
1849 	/* for TSO, pseudo_hdr_offset holds mss.
1850 	 * The firmware figures out where to put
1851 	 * the checksum by parsing the header. */
1852 	pseudo_hdr_offset = htobe16(mss);
1853 
1854 	tx = &ss->tx;
1855 	req = tx->req_list;
1856 	seg = tx->seg_list;
1857 	cnt = 0;
1858 	rdma_count = 0;
1859 	/* "rdma_count" is the number of RDMAs belonging to the
1860 	 * current packet BEFORE the current send request. For
1861 	 * non-TSO packets, this is equal to "count".
1862 	 * For TSO packets, rdma_count needs to be reset
1863 	 * to 0 after a segment cut.
1864 	 *
1865 	 * The rdma_count field of the send request is
1866 	 * the number of RDMAs of the packet starting at
1867 	 * that request. For TSO send requests with one or more cuts
1868 	 * in the middle, this is the number of RDMAs starting
1869 	 * after the last cut in the request. All previous
1870 	 * segments before the last cut implicitly have 1 RDMA.
1871 	 *
1872 	 * Since the number of RDMAs is not known beforehand,
1873 	 * it must be filled-in retroactively - after each
1874 	 * segmentation cut or at the end of the entire packet.
1875 	 */
1876 
1877 	while (busdma_seg_cnt) {
1878 		/* Break the busdma segment up into pieces*/
1879 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1880 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1881 		len = seg->ds_len;
1882 
1883 		while (len) {
1884 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1885 			seglen = len;
1886 			cum_len_next = cum_len + seglen;
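			/*
			 * speculatively back-fill the rdma_count of the
			 * request that started the current RDMA run; it is
			 * re-written on later iterations and again after
			 * the loop if this run is cut
			 */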
1887 			(req-rdma_count)->rdma_count = rdma_count + 1;
1888 			if (__predict_true(cum_len >= 0)) {
1889 				/* payload */
1890 				chop = (cum_len_next > mss);
1891 				cum_len_next = cum_len_next % mss;
1892 				next_is_first = (cum_len_next == 0);
1893 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1894 				flags_next |= next_is_first *
1895 					MXGEFW_FLAGS_FIRST;
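				/*
				 * branchless bookkeeping: a chop or a new
				 * frame boundary forces rdma_count to -1 so
				 * the increment below restarts the run; a
				 * chop that is not also a boundary adds one
				 * back for the piece continuing here
				 */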
1896 				rdma_count |= -(chop | next_is_first);
1897 				rdma_count += chop & !next_is_first;
1898 			} else if (cum_len_next >= 0) {
1899 				/* header ends */
1900 				rdma_count = -1;
1901 				cum_len_next = 0;
1902 				seglen = -cum_len;
1903 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1904 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1905 					MXGEFW_FLAGS_FIRST |
1906 					(small * MXGEFW_FLAGS_SMALL);
1907 			}
1908 
1909 			req->addr_high = high_swapped;
1910 			req->addr_low = htobe32(low);
1911 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 			req->pad = 0;
1913 			req->rdma_count = 1;
1914 			req->length = htobe16(seglen);
1915 			req->cksum_offset = cksum_offset;
1916 			req->flags = flags | ((cum_len & 1) *
1917 					      MXGEFW_FLAGS_ALIGN_ODD);
1918 			low += seglen;
1919 			len -= seglen;
1920 			cum_len = cum_len_next;
1921 			flags = flags_next;
1922 			req++;
1923 			cnt++;
1924 			rdma_count++;
1925 			if (__predict_false(cksum_offset > seglen))
1926 				cksum_offset -= seglen;
1927 			else
1928 				cksum_offset = 0;
1929 			if (__predict_false(cnt > tx->max_desc))
1930 				goto drop;
1931 		}
1932 		busdma_seg_cnt--;
1933 		seg++;
1934 	}
1935 	(req-rdma_count)->rdma_count = rdma_count;
1936 
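	/*
	 * walk backwards from the final descriptor, marking each one
	 * TSO_LAST up to and including the descriptor that begins the
	 * last segment (flagged FIRST or TSO_CHOP)
	 */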
1937 	do {
1938 		req--;
1939 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1940 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1941 
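	/* flag the slot of the last descriptor so that mxge_tx_done()
	 * credits a completed packet when it drains it */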
1942 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1943 	mxge_submit_req(tx, tx->req_list, cnt);
1944 #ifdef IFNET_BUF_RING
1945 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1946 		/* tell the NIC to start polling this slice */
1947 		*tx->send_go = 1;
1948 		tx->queue_active = 1;
1949 		tx->activate++;
1950 		wmb();
1951 	}
1952 #endif
1953 	return;
1954 
1955 drop:
1956 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1957 	m_freem(m);
1958 	ss->oerrors++;
1959 	if (!once) {
1960 		kprintf("tx->max_desc exceeded via TSO!\n");
1961 		kprintf("mss = %d, %ld, %d!\n", mss,
1962 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1963 		once = 1;
1964 	}
1965 	return;
1966 
1967 }
1968 
1969 #endif /* IFCAP_TSO4 */
1970 
1971 #ifdef MXGE_NEW_VLAN_API
1972 /*
1973  * We reproduce the software vlan tag insertion from
1974  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1975  * vlan tag insertion. We need to advertise this in order to have the
1976  * vlan interface respect our csum offload flags.
1977  */
1978 static struct mbuf *
1979 mxge_vlan_tag_insert(struct mbuf *m)
1980 {
1981 	struct ether_vlan_header *evl;
1982 
1983 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1984 	if (__predict_false(m == NULL))
1985 		return NULL;
1986 	if (m->m_len < sizeof(*evl)) {
1987 		m = m_pullup(m, sizeof(*evl));
1988 		if (__predict_false(m == NULL))
1989 			return NULL;
1990 	}
1991 	/*
1992 	 * Transform the Ethernet header into an Ethernet header
1993 	 * with 802.1Q encapsulation.
1994 	 */
1995 	evl = mtod(m, struct ether_vlan_header *);
1996 	bcopy((char *)evl + EVL_ENCAPLEN,
1997 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1998 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1999 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2000 	m->m_flags &= ~M_VLANTAG;
2001 	return m;
2002 }
2003 #endif /* MXGE_NEW_VLAN_API */
2004 
2005 static void
2006 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2007 {
2008 	mxge_softc_t *sc;
2009 	mcp_kreq_ether_send_t *req;
2010 	bus_dma_segment_t *seg;
2011 	struct mbuf *m_tmp;
2012 	struct ifnet *ifp;
2013 	mxge_tx_ring_t *tx;
2014 	struct ip *ip;
2015 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2016 	uint16_t pseudo_hdr_offset;
2017 	uint8_t flags, cksum_offset;
2018 
2019 
2020 	sc = ss->sc;
2021 	ifp = sc->ifp;
2022 	tx = &ss->tx;
2023 
2024 	ip_off = sizeof (struct ether_header);
2025 #ifdef MXGE_NEW_VLAN_API
2026 	if (m->m_flags & M_VLANTAG) {
2027 		m = mxge_vlan_tag_insert(m);
2028 		if (__predict_false(m == NULL))
2029 			goto drop;
2030 		ip_off += EVL_ENCAPLEN;
2031 	}
2032 #endif
2033 	/* (try to) map the frame for DMA */
2034 	idx = tx->req & tx->mask;
2035 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2036 					   m, tx->seg_list, 1, &cnt,
2037 					   BUS_DMA_NOWAIT);
2038 	if (__predict_false(err == EFBIG)) {
2039 		/* Too many segments in the chain.  Try
2040 		   to defrag */
2041 		m_tmp = m_defrag(m, MB_DONTWAIT);
2042 		if (m_tmp == NULL) {
2043 			goto drop;
2044 		}
2045 		ss->tx.defrag++;
2046 		m = m_tmp;
2047 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2048 					      tx->info[idx].map,
2049 					      m, tx->seg_list, 1, &cnt,
2050 					      BUS_DMA_NOWAIT);
2051 	}
2052 	if (__predict_false(err != 0)) {
2053 		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d,"
2054 			      " packet len = %d\n", err, m->m_pkthdr.len);
2055 		goto drop;
2056 	}
2057 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2058 			BUS_DMASYNC_PREWRITE);
2059 	tx->info[idx].m = m;
2060 
2061 #if IFCAP_TSO4
2062 	/* TSO is different enough, we handle it in another routine */
2063 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2064 		mxge_encap_tso(ss, m, cnt, ip_off);
2065 		return;
2066 	}
2067 #endif
2068 
2069 	req = tx->req_list;
2070 	cksum_offset = 0;
2071 	pseudo_hdr_offset = 0;
2072 	flags = MXGEFW_FLAGS_NO_TSO;
2073 
2074 	/* checksum offloading? */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2076 		/* ensure ip header is in first mbuf, copy
2077 		   it to a scratch buffer if not */
2078 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2079 			m_copydata(m, 0, ip_off + sizeof (*ip),
2080 				   ss->scratch);
2081 			ip = (struct ip *)(ss->scratch + ip_off);
2082 		} else {
2083 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2084 		}
2085 		cksum_offset = ip_off + (ip->ip_hl << 2);
2086 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2087 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2088 		req->cksum_offset = cksum_offset;
2089 		flags |= MXGEFW_FLAGS_CKSUM;
2090 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2091 	} else {
2092 		odd_flag = 0;
2093 	}
2094 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2095 		flags |= MXGEFW_FLAGS_SMALL;
2096 
2097 	/* convert segments into a request list */
2098 	cum_len = 0;
2099 	seg = tx->seg_list;
2100 	req->flags = MXGEFW_FLAGS_FIRST;
2101 	for (i = 0; i < cnt; i++) {
2102 		req->addr_low =
2103 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2104 		req->addr_high =
2105 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2106 		req->length = htobe16(seg->ds_len);
2107 		req->cksum_offset = cksum_offset;
2108 		if (cksum_offset > seg->ds_len)
2109 			cksum_offset -= seg->ds_len;
2110 		else
2111 			cksum_offset = 0;
2112 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2113 		req->pad = 0; /* complete solid 16-byte block */
2114 		req->rdma_count = 1;
2115 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2116 		cum_len += seg->ds_len;
2117 		seg++;
2118 		req++;
2119 		req->flags = 0;
2120 	}
2121 	req--;
2122 	/* pad runts to 60 bytes (ETHER_MIN_LEN less the 4-byte FCS) */
2123 	if (cum_len < 60) {
2124 		req++;
2125 		req->addr_low =
2126 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 		req->addr_high =
2128 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 		req->length = htobe16(60 - cum_len);
2130 		req->cksum_offset = 0;
2131 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2132 		req->pad = 0; /* complete solid 16-byte block */
2133 		req->rdma_count = 1;
2134 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2135 		cnt++;
2136 	}
2137 
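	/* non-TSO packets form a single RDMA run, so the first
	 * descriptor's rdma_count is simply the descriptor count */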
2138 	tx->req_list[0].rdma_count = cnt;
2139 #if 0
2140 	/* print what the firmware will see */
2141 	for (i = 0; i < cnt; i++) {
2142 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2143 		    "cso:%d, flags:0x%x, rdma:%d\n",
2144 		    i, (int)ntohl(tx->req_list[i].addr_high),
2145 		    (int)ntohl(tx->req_list[i].addr_low),
2146 		    (int)ntohs(tx->req_list[i].length),
2147 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2148 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2149 		    tx->req_list[i].rdma_count);
2150 	}
2151 	kprintf("--------------\n");
2152 #endif
2153 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2154 	mxge_submit_req(tx, tx->req_list, cnt);
2155 #ifdef IFNET_BUF_RING
2156 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2157 		/* tell the NIC to start polling this slice */
2158 		*tx->send_go = 1;
2159 		tx->queue_active = 1;
2160 		tx->activate++;
2161 		wmb();
2162 	}
2163 #endif
2164 	return;
2165 
2166 drop:
2167 	m_freem(m);
2168 	ss->oerrors++;
2169 	return;
2170 }
2171 
2172 #ifdef IFNET_BUF_RING
2173 static void
2174 mxge_qflush(struct ifnet *ifp)
2175 {
2176 	mxge_softc_t *sc = ifp->if_softc;
2177 	mxge_tx_ring_t *tx;
2178 	struct mbuf *m;
2179 	int slice;
2180 
2181 	for (slice = 0; slice < sc->num_slices; slice++) {
2182 		tx = &sc->ss[slice].tx;
2183 		lwkt_serialize_enter(sc->ifp->if_serializer);
2184 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2185 			m_freem(m);
2186 		lwkt_serialize_exit(sc->ifp->if_serializer);
2187 	}
2188 	if_qflush(ifp);
2189 }
2190 
2191 static inline void
2192 mxge_start_locked(struct mxge_slice_state *ss)
2193 {
2194 	mxge_softc_t *sc;
2195 	struct mbuf *m;
2196 	struct ifnet *ifp;
2197 	mxge_tx_ring_t *tx;
2198 
2199 	sc = ss->sc;
2200 	ifp = sc->ifp;
2201 	tx = &ss->tx;
2202 
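	/* only dequeue while the ring can still hold a worst-case
	 * request of tx->max_desc descriptors */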
2203 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2204 		m = drbr_dequeue(ifp, tx->br);
2205 		if (m == NULL) {
2206 			return;
2207 		}
2208 		/* let BPF see it */
2209 		BPF_MTAP(ifp, m);
2210 
2211 		/* give it to the nic */
2212 		mxge_encap(ss, m);
2213 	}
2214 	/* ran out of transmit slots */
2215 	if (((ss->if_flags & IFF_OACTIVE) == 0)
2216 	    && (!drbr_empty(ifp, tx->br))) {
2217 		ss->if_flags |= IFF_OACTIVE;
2218 		tx->stall++;
2219 	}
2220 }
2221 
2222 static int
2223 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2224 {
2225 	mxge_softc_t *sc;
2226 	struct ifnet *ifp;
2227 	mxge_tx_ring_t *tx;
2228 	int err;
2229 
2230 	sc = ss->sc;
2231 	ifp = sc->ifp;
2232 	tx = &ss->tx;
2233 
2234 	if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2235 	    IFF_RUNNING) {
2236 		err = drbr_enqueue(ifp, tx->br, m);
2237 		return (err);
2238 	}
2239 
2240 	if (drbr_empty(ifp, tx->br) &&
2241 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2242 		/* let BPF see it */
2243 		BPF_MTAP(ifp, m);
2244 		/* give it to the nic */
2245 		mxge_encap(ss, m);
2246 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2247 		return (err);
2248 	}
2249 	if (!drbr_empty(ifp, tx->br))
2250 		mxge_start_locked(ss);
2251 	return (0);
2252 }
2253 
2254 static int
2255 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2256 {
2257 	mxge_softc_t *sc = ifp->if_softc;
2258 	struct mxge_slice_state *ss;
2259 	mxge_tx_ring_t *tx;
2260 	int err = 0;
2261 	int slice = 0;	/* flowid steering is compiled out below; use slice 0 */
2262 
2263 #if 0
2264 	slice = m->m_pkthdr.flowid;
2265 #endif
2266 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2267 
2268 	ss = &sc->ss[slice];
2269 	tx = &ss->tx;
2270 
2271 	if (lwkt_serialize_try(ifp->if_serializer)) {
2272 		err = mxge_transmit_locked(ss, m);
2273 		lwkt_serialize_exit(ifp->if_serializer);
2274 	} else {
2275 		err = drbr_enqueue(ifp, tx->br, m);
2276 	}
2277 
2278 	return (err);
2279 }
2280 
2281 #else
2282 
2283 static inline void
2284 mxge_start_locked(struct mxge_slice_state *ss)
2285 {
2286 	mxge_softc_t *sc;
2287 	struct mbuf *m;
2288 	struct ifnet *ifp;
2289 	mxge_tx_ring_t *tx;
2290 
2291 	sc = ss->sc;
2292 	ifp = sc->ifp;
2293 	tx = &ss->tx;
2294 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2295 		m = ifq_dequeue(&ifp->if_snd, NULL);
2296 		if (m == NULL) {
2297 			return;
2298 		}
2299 		/* let BPF see it */
2300 		BPF_MTAP(ifp, m);
2301 
2302 		/* give it to the nic */
2303 		mxge_encap(ss, m);
2304 	}
2305 	/* ran out of transmit slots */
2306 	if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2307 		sc->ifp->if_flags |= IFF_OACTIVE;
2308 		tx->stall++;
2309 	}
2310 }
2311 #endif
2312 static void
2313 mxge_start(struct ifnet *ifp)
2314 {
2315 	mxge_softc_t *sc = ifp->if_softc;
2316 	struct mxge_slice_state *ss;
2317 
2318 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2319 	/* only use the first slice for now */
2320 	ss = &sc->ss[0];
2321 	mxge_start_locked(ss);
2322 }
2323 
2324 /*
2325  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2326  * at most 32 bytes at a time, so as to avoid involving the software
2327  * pio handler in the nic.   We re-write the first segment's low
2328  * DMA address to mark it valid only after we write the entire chunk
2329  * in a burst
2330  */
2331 static inline void
2332 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2333 		mcp_kreq_ether_recv_t *src)
2334 {
2335 	uint32_t low;
2336 
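	/* an all-ones low address marks the first entry invalid to the
	 * NIC until the real address is written back below */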
2337 	low = src->addr_low;
2338 	src->addr_low = 0xffffffff;
2339 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 	wmb();
2341 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2342 	wmb();
2343 	src->addr_low = low;
2344 	dst->addr_low = low;
2345 	wmb();
2346 }
2347 
2348 static int
2349 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 {
2351 	bus_dma_segment_t seg;
2352 	struct mbuf *m;
2353 	mxge_rx_ring_t *rx = &ss->rx_small;
2354 	int cnt, err;
2355 
2356 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2357 	if (m == NULL) {
2358 		rx->alloc_fail++;
2359 		err = ENOBUFS;
2360 		goto done;
2361 	}
2362 	m->m_len = m->m_pkthdr.len = MHLEN;
2363 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2364 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2365 	if (err != 0) {
2366 		kprintf("can't dmamap small (%d)\n", err);
2367 		m_free(m);
2368 		goto done;
2369 	}
2370 	rx->info[idx].m = m;
2371 	rx->shadow[idx].addr_low =
2372 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 	rx->shadow[idx].addr_high =
2374 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2375 
2376 done:
2377 	if ((idx & 7) == 7)
2378 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2379 	return err;
2380 }
2381 
2382 
2383 static int
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2385 {
2386 	bus_dma_segment_t seg[3];
2387 	struct mbuf *m;
2388 	mxge_rx_ring_t *rx = &ss->rx_big;
2389 	int cnt, err, i;
2390 
2391 	if (rx->cl_size == MCLBYTES)
2392 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 	else {
2394 #if 0
2395 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 #else
2397 		/*
2398 		 * XXX: allocate normal sized buffers for big buffers.
2399 		 * We should be fine as long as we don't get any jumbo frames
2400 		 */
2401 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2402 #endif
2403 	}
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2409 	/* rx->mlen covers the MTU plus all headers and the firmware pad */
2410 	m->m_len = m->m_pkthdr.len = rx->mlen;
2411 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2412 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2413 	if (err != 0) {
2414 		kprintf("can't dmamap big (%d)\n", err);
2415 		m_free(m);
2416 		goto done;
2417 	}
2418 	rx->info[idx].m = m;
2419 	rx->shadow[idx].addr_low =
2420 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2421 	rx->shadow[idx].addr_high =
2422 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2423 
2424 #if MXGE_VIRT_JUMBOS
2425 	for (i = 1; i < cnt; i++) {
2426 		rx->shadow[idx + i].addr_low =
2427 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2428 		rx->shadow[idx + i].addr_high =
2429 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2430 	}
2431 #endif
2432 
2433 done:
2434 	for (i = 0; i < rx->nbufs; i++) {
2435 		if ((idx & 7) == 7) {
2436 			mxge_submit_8rx(&rx->lanai[idx - 7],
2437 					&rx->shadow[idx - 7]);
2438 		}
2439 		idx++;
2440 	}
2441 	return err;
2442 }
2443 
2444 /*
2445  *  Myri10GE hardware checksums are not valid if the sender
2446  *  padded the frame with non-zero padding.  This is because
2447  *  the firmware just does a simple 16-bit 1s complement
2448  *  checksum across the entire frame, excluding the first 14
2449  *  bytes.  It is best to simply check the checksum and
2450  *  tell the stack about it only if the checksum is good
2451  */
2452 
2453 static inline uint16_t
2454 mxge_rx_csum(struct mbuf *m, int csum)
2455 {
2456 	struct ether_header *eh;
2457 	struct ip *ip;
2458 	uint16_t c;
2459 
2460 	eh = mtod(m, struct ether_header *);
2461 
2462 	/* only deal with IPv4 TCP & UDP for now */
2463 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2464 		return 1;
2465 	ip = (struct ip *)(eh + 1);
2466 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2467 			    ip->ip_p != IPPROTO_UDP))
2468 		return 1;
2469 #ifdef INET
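	/* fold the firmware's raw frame checksum with the pseudo-header;
	 * a good TCP/UDP checksum yields 0xffff here, i.e. 0 after the
	 * inversion below */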
2470 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2471 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2472 			    (ip->ip_hl << 2) + ip->ip_p));
2473 #else
2474 	c = 1;
2475 #endif
2476 	c ^= 0xffff;
2477 	return (c);
2478 }
2479 
2480 static void
2481 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2482 {
2483 	struct ether_vlan_header *evl;
2484 	struct ether_header *eh;
2485 	uint32_t partial;
2486 
2487 	evl = mtod(m, struct ether_vlan_header *);
2488 	eh = mtod(m, struct ether_header *);
2489 
2490 	/*
2491 	 * fix the checksum by subtracting the sum of the EVL_ENCAPLEN
2492 	 * bytes that follow what the firmware thought was the end of
2493 	 * the ethernet header.
2494 	 */
2495 
2496 	/* put checksum into host byte order */
2497 	*csum = ntohs(*csum);
2498 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
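	/*
	 * one's complement subtraction: adding ~partial with end-around
	 * carry removes the encapsulation bytes' contribution; the two
	 * folds below compress the carries back into 16 bits
	 */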
2499 	(*csum) += ~partial;
2500 	(*csum) +=  ((*csum) < ~partial);
2501 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2502 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2503 
2504 	/* restore checksum to network byte order;
2505 	   later consumers expect this */
2506 	*csum = htons(*csum);
2507 
2508 	/* save the tag */
2509 #ifdef MXGE_NEW_VLAN_API
2510 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2511 #else
2512 	{
2513 		struct m_tag *mtag;
2514 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2515 				   MB_DONTWAIT);
2516 		if (mtag == NULL)
2517 			return;
2518 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2519 		m_tag_prepend(m, mtag);
2520 	}
2521 
2522 #endif
2523 	m->m_flags |= M_VLANTAG;
2524 
2525 	/*
2526 	 * Remove the 802.1q header by copying the Ethernet
2527 	 * addresses over it and adjusting the beginning of
2528 	 * the data in the mbuf.  The encapsulated Ethernet
2529 	 * type field is already in place.
2530 	 */
2531 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2532 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2533 	m_adj(m, EVL_ENCAPLEN);
2534 }
2535 
2536 
2537 static inline void
2538 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2539 		   struct mbuf_chain *chain)
2540 {
2541 	mxge_softc_t *sc;
2542 	struct ifnet *ifp;
2543 	struct mbuf *m;
2544 	struct ether_header *eh;
2545 	mxge_rx_ring_t *rx;
2546 	bus_dmamap_t old_map;
2547 	int idx;
2548 	uint16_t tcpudp_csum;
2549 
2550 	sc = ss->sc;
2551 	ifp = sc->ifp;
2552 	rx = &ss->rx_big;
2553 	idx = rx->cnt & rx->mask;
2554 	rx->cnt += rx->nbufs;
2555 	/* save a pointer to the received mbuf */
2556 	m = rx->info[idx].m;
2557 	/* try to replace the received mbuf */
2558 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2559 		/* drop the frame -- the old mbuf is re-cycled */
2560 		ifp->if_ierrors++;
2561 		return;
2562 	}
2563 
2564 	/* unmap the received buffer */
2565 	old_map = rx->info[idx].map;
2566 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2567 	bus_dmamap_unload(rx->dmat, old_map);
2568 
2569 	/* swap the bus_dmamap_t's */
2570 	rx->info[idx].map = rx->extra_map;
2571 	rx->extra_map = old_map;
2572 
2573 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2574 	 * aligned */
2575 	m->m_data += MXGEFW_PAD;
2576 
2577 	m->m_pkthdr.rcvif = ifp;
2578 	m->m_len = m->m_pkthdr.len = len;
2579 	ss->ipackets++;
2580 	eh = mtod(m, struct ether_header *);
2581 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2582 		mxge_vlan_tag_remove(m, &csum);
2583 	}
2584 	/* if the checksum is valid, mark it in the mbuf header */
2585 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2586 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2587 			return;
2588 		/* otherwise, it was a UDP frame, or a TCP frame which
2589 		   we could not do LRO on.  Tell the stack that the
2590 		   checksum is good */
2591 		m->m_pkthdr.csum_data = 0xffff;
2592 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2593 	}
2594 #if 0
2595 	/* flowid only valid if RSS hashing is enabled */
2596 	if (sc->num_slices > 1) {
2597 		m->m_pkthdr.flowid = (ss - sc->ss);
2598 		m->m_flags |= M_FLOWID;
2599 	}
2600 #endif
2601 	ether_input_chain(ifp, m, NULL, chain);
2602 }
2603 
2604 static inline void
2605 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum,
2606 		   struct mbuf_chain *chain)
2607 {
2608 	mxge_softc_t *sc;
2609 	struct ifnet *ifp;
2610 	struct ether_header *eh;
2611 	struct mbuf *m;
2612 	mxge_rx_ring_t *rx;
2613 	bus_dmamap_t old_map;
2614 	int idx;
2615 	uint16_t tcpudp_csum;
2616 
2617 	sc = ss->sc;
2618 	ifp = sc->ifp;
2619 	rx = &ss->rx_small;
2620 	idx = rx->cnt & rx->mask;
2621 	rx->cnt++;
2622 	/* save a pointer to the received mbuf */
2623 	m = rx->info[idx].m;
2624 	/* try to replace the received mbuf */
2625 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2626 		/* drop the frame -- the old mbuf is re-cycled */
2627 		ifp->if_ierrors++;
2628 		return;
2629 	}
2630 
2631 	/* unmap the received buffer */
2632 	old_map = rx->info[idx].map;
2633 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2634 	bus_dmamap_unload(rx->dmat, old_map);
2635 
2636 	/* swap the bus_dmamap_t's */
2637 	rx->info[idx].map = rx->extra_map;
2638 	rx->extra_map = old_map;
2639 
2640 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2641 	 * aligned */
2642 	m->m_data += MXGEFW_PAD;
2643 
2644 	m->m_pkthdr.rcvif = ifp;
2645 	m->m_len = m->m_pkthdr.len = len;
2646 	ss->ipackets++;
2647 	eh = mtod(m, struct ether_header *);
2648 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2649 		mxge_vlan_tag_remove(m, &csum);
2650 	}
2651 	/* if the checksum is valid, mark it in the mbuf header */
2652 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2653 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2654 			return;
2655 		/* otherwise, it was a UDP frame, or a TCP frame which
2656 		   we could not do LRO on.  Tell the stack that the
2657 		   checksum is good */
2658 		m->m_pkthdr.csum_data = 0xffff;
2659 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2660 	}
2661 #if 0
2662 	/* flowid only valid if RSS hashing is enabled */
2663 	if (sc->num_slices > 1) {
2664 		m->m_pkthdr.flowid = (ss - sc->ss);
2665 		m->m_flags |= M_FLOWID;
2666 	}
2667 #endif
2668 	ether_input_chain(ifp, m, NULL, chain);
2669 }
2670 
2671 /*
2672  * XXX
2673  *
2674  * Inlining the call to this function causes mxge_intr() to grow too large
2675  * for GCC's stack size limits (which shouldn't take into account inlining
2676  * of leaf functions at one call site anyway). Inlining is definitely a
2677  * good idea in this case though, so mark the function appropriately.
2678  */
2679 static __always_inline void
2680 mxge_clean_rx_done(struct mxge_slice_state *ss)
2681 {
2682 	mxge_rx_done_t *rx_done = &ss->rx_done;
2683 	int limit = 0;
2684 	uint16_t length;
2685 	uint16_t checksum;
2686 	struct mbuf_chain chain[MAXCPU];
2687 
2688 	ether_input_chain_init(chain);
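	/* a zero length marks the first unconsumed completion entry;
	 * each entry's length is cleared below once it is processed */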
2689 	while (rx_done->entry[rx_done->idx].length != 0) {
2690 		length = ntohs(rx_done->entry[rx_done->idx].length);
2691 		rx_done->entry[rx_done->idx].length = 0;
2692 		checksum = rx_done->entry[rx_done->idx].checksum;
2693 		if (length <= (MHLEN - MXGEFW_PAD))
2694 			mxge_rx_done_small(ss, length, checksum, chain);
2695 		else
2696 			mxge_rx_done_big(ss, length, checksum, chain);
2697 		rx_done->cnt++;
2698 		rx_done->idx = rx_done->cnt & rx_done->mask;
2699 
2700 		/* limit potential for livelock */
2701 		if (__predict_false(++limit > rx_done->mask / 2))
2702 			break;
2703 	}
2704 	ether_input_dispatch(chain);
2705 #ifdef INET
2706 	while (!SLIST_EMPTY(&ss->lro_active)) {
2707 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2708 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2709 		mxge_lro_flush(ss, lro);
2710 	}
2711 #endif
2712 }
2713 
2714 
2715 static inline void
2716 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2717 {
2718 	struct ifnet *ifp;
2719 	mxge_tx_ring_t *tx;
2720 	struct mbuf *m;
2721 	bus_dmamap_t map;
2722 	int idx;
2723 	int *flags;
2724 
2725 	tx = &ss->tx;
2726 	ifp = ss->sc->ifp;
2727 	ASSERT_SERIALIZED(ifp->if_serializer);
2728 	while (tx->pkt_done != mcp_idx) {
2729 		idx = tx->done & tx->mask;
2730 		tx->done++;
2731 		m = tx->info[idx].m;
2732 		/* mbuf and DMA map only attached to the first
2733 		   segment per-mbuf */
2734 		if (m != NULL) {
2735 			ss->obytes += m->m_pkthdr.len;
2736 			if (m->m_flags & M_MCAST)
2737 				ss->omcasts++;
2738 			ss->opackets++;
2739 			tx->info[idx].m = NULL;
2740 			map = tx->info[idx].map;
2741 			bus_dmamap_unload(tx->dmat, map);
2742 			m_freem(m);
2743 		}
2744 		if (tx->info[idx].flag) {
2745 			tx->info[idx].flag = 0;
2746 			tx->pkt_done++;
2747 		}
2748 	}
2749 
2750 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2751 	   it's OK to send packets */
2752 #ifdef IFNET_BUF_RING
2753 	flags = &ss->if_flags;
2754 #else
2755 	flags = &ifp->if_flags;
2756 #endif
2757 	if ((*flags) & IFF_OACTIVE &&
2758 	    tx->req - tx->done < (tx->mask + 1)/4) {
2759 		*(flags) &= ~IFF_OACTIVE;
2760 		ss->tx.wake++;
2761 		mxge_start_locked(ss);
2762 	}
2763 #ifdef IFNET_BUF_RING
2764 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2765 		/* let the NIC stop polling this queue, since there
2766 		 * are no more transmits pending */
2767 		*tx->send_stop = 1;
2768 		tx->queue_active = 0;
2769 		tx->deactivate++;
2770 		wmb();
2771 	}
2774 #endif
2775 
2776 }
2777 
2778 static struct mxge_media_type mxge_xfp_media_types[] =
2779 {
2780 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2781 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2782 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2783 	{0,		(1 << 5),	"10GBASE-ER"},
2784 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2785 	{0,		(1 << 3),	"10GBASE-SW"},
2786 	{0,		(1 << 2),	"10GBASE-LW"},
2787 	{0,		(1 << 1),	"10GBASE-EW"},
2788 	{0,		(1 << 0),	"Reserved"}
2789 };
2790 static struct mxge_media_type mxge_sfp_media_types[] =
2791 {
2792 	{0,		(1 << 7),	"Reserved"},
2793 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2794 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2795 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2796 };
2797 
2798 static void
2799 mxge_set_media(mxge_softc_t *sc, int type)
2800 {
2801 	sc->media_flags |= type;
2802 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2803 	ifmedia_set(&sc->media, sc->media_flags);
2804 }
2805 
2806 
2807 /*
2808  * Determine the media type for a NIC.  Some XFPs will identify
2809  * themselves only when their link is up, so this is initiated via a
2810  * link up interrupt.  However, this can potentially take up to
2811  * several milliseconds, so it is run via the watchdog routine, rather
2812  * than in the interrupt handler itself.   This need only be done
2813  * once, not each time the link is up.
2814  */
2815 static void
2816 mxge_media_probe(mxge_softc_t *sc)
2817 {
2818 	mxge_cmd_t cmd;
2819 	char *cage_type;
2820 	char *ptr;
2821 	struct mxge_media_type *mxge_media_types = NULL;
2822 	int i, err, ms, mxge_media_type_entries;
2823 	uint32_t byte;
2824 
2825 	sc->need_media_probe = 0;
2826 
2827 	/* if we've already set a media type, we're done */
2828 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2829 		return;
2830 
2831 	/*
2832 	 * parse the product code to determine the interface type
2833 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2834 	 * after the 3rd dash in the driver's cached copy of the
2835 	 * EEPROM's product code string.
2836 	 */
2837 	ptr = sc->product_code_string;
2838 	if (ptr == NULL) {
2839 		device_printf(sc->dev, "Missing product code\n");
		return;		/* can't parse the media type without it */
2840 	}
2841 
2842 	for (i = 0; i < 3; i++, ptr++) {
2843 		ptr = index(ptr, '-');
2844 		if (ptr == NULL) {
2845 			device_printf(sc->dev,
2846 				      "only %d dashes in PC?!?\n", i);
2847 			return;
2848 		}
2849 	}
2850 	if (*ptr == 'C') {
2851 		/* -C is CX4 */
2852 		mxge_set_media(sc, IFM_10G_CX4);
2853 		return;
2854 	}
2855 	else if (*ptr == 'Q') {
2856 		/* -Q is Quad Ribbon Fiber */
2857 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2858 		/* FreeBSD has no media type for Quad ribbon fiber */
2859 		return;
2860 	}
2861 
2862 	if (*ptr == 'R') {
2863 		/* -R is XFP */
2864 		mxge_media_types = mxge_xfp_media_types;
2865 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2866 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2867 		cage_type = "XFP";
2868 	}
2869 
2870 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2871 		/* -S or -2S is SFP+ */
2872 		mxge_media_types = mxge_sfp_media_types;
2873 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2874 		cage_type = "SFP+";
2875 		byte = 3;
2876 	}
2877 
2878 	if (mxge_media_types == NULL) {
2879 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2880 		return;
2881 	}
2882 
2883 	/*
2884 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2885 	 * we try to determine what is in the cage by using the
2886 	 * firmware's I2C commands to read the module's 10GbE compliance
2887 	 * register.  We read just one byte, which may take over
2888 	 * a millisecond
2889 	 */
2890 
2891 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2892 	cmd.data1 = byte;
2893 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2894 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2895 		device_printf(sc->dev, "failed to read XFP\n");
2896 	}
2897 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2898 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2899 	}
2900 	if (err != MXGEFW_CMD_OK) {
2901 		return;
2902 	}
2903 
2904 	/* now we wait for the data to be cached */
2905 	cmd.data0 = byte;
2906 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2907 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2908 		DELAY(1000);
2909 		cmd.data0 = byte;
2910 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2911 	}
2912 	if (err != MXGEFW_CMD_OK) {
2913 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2914 			      cage_type, err, ms);
2915 		return;
2916 	}
2917 
2918 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2919 		if (mxge_verbose)
2920 			device_printf(sc->dev, "%s:%s\n", cage_type,
2921 				      mxge_media_types[0].name);
2922 		mxge_set_media(sc, IFM_10G_CX4);
2923 		return;
2924 	}
2925 	for (i = 1; i < mxge_media_type_entries; i++) {
2926 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2927 			if (mxge_verbose)
2928 				device_printf(sc->dev, "%s:%s\n",
2929 					      cage_type,
2930 					      mxge_media_types[i].name);
2931 
2932 			mxge_set_media(sc, mxge_media_types[i].flag);
2933 			return;
2934 		}
2935 	}
2936 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2937 		      cmd.data0);
2938 
2939 	return;
2940 }
2941 
2942 static void
2943 mxge_intr(void *arg)
2944 {
2945 	struct mxge_slice_state *ss = arg;
2946 	mxge_softc_t *sc = ss->sc;
2947 	mcp_irq_data_t *stats = ss->fw_stats;
2948 	mxge_tx_ring_t *tx = &ss->tx;
2949 	mxge_rx_done_t *rx_done = &ss->rx_done;
2950 	uint32_t send_done_count;
2951 	uint8_t valid;
2952 
2953 
2954 #ifndef IFNET_BUF_RING
2955 	/* an interrupt on a non-zero slice is implicitly valid
2956 	   since MSI-X irqs are not shared */
2957 	if (ss != sc->ss) {
2958 		mxge_clean_rx_done(ss);
2959 		*ss->irq_claim = be32toh(3);
2960 		return;
2961 	}
2962 #endif
2963 
2964 	/* make sure the DMA has finished */
2965 	if (!stats->valid) {
2966 		return;
2967 	}
2968 	valid = stats->valid;
2969 
2970 	if (sc->legacy_irq) {
2971 		/* lower legacy IRQ  */
2972 		*sc->irq_deassert = 0;
2973 		if (!mxge_deassert_wait)
2974 			/* don't wait for conf. that irq is low */
2975 			stats->valid = 0;
2976 	} else {
2977 		stats->valid = 0;
2978 	}
2979 
2980 	/* loop while waiting for legacy irq deassertion */
2981 	do {
2982 		/* check for transmit completes and receives */
2983 		send_done_count = be32toh(stats->send_done_count);
2984 		while ((send_done_count != tx->pkt_done) ||
2985 		       (rx_done->entry[rx_done->idx].length != 0)) {
2986 			if (send_done_count != tx->pkt_done)
2987 				mxge_tx_done(ss, (int)send_done_count);
2988 			mxge_clean_rx_done(ss);
2989 			send_done_count = be32toh(stats->send_done_count);
2990 		}
2991 		if (sc->legacy_irq && mxge_deassert_wait)
2992 			wmb();
2993 	} while (*((volatile uint8_t *) &stats->valid));
2994 
2995 	/* fw link & error stats meaningful only on the first slice */
2996 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2997 		if (sc->link_state != stats->link_up) {
2998 			sc->link_state = stats->link_up;
2999 			if (sc->link_state) {
3000 				sc->ifp->if_link_state = LINK_STATE_UP;
3001 				if_link_state_change(sc->ifp);
3002 				if (mxge_verbose)
3003 					device_printf(sc->dev, "link up\n");
3004 			} else {
3005 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3006 				if_link_state_change(sc->ifp);
3007 				if (mxge_verbose)
3008 					device_printf(sc->dev, "link down\n");
3009 			}
3010 			sc->need_media_probe = 1;
3011 		}
3012 		if (sc->rdma_tags_available !=
3013 		    be32toh(stats->rdma_tags_available)) {
3014 			sc->rdma_tags_available =
3015 				be32toh(stats->rdma_tags_available);
3016 			device_printf(sc->dev, "RDMA timed out! %d tags "
3017 				      "left\n", sc->rdma_tags_available);
3018 		}
3019 
3020 		if (stats->link_down) {
3021 			sc->down_cnt += stats->link_down;
3022 			sc->link_state = 0;
3023 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3024 			if_link_state_change(sc->ifp);
3025 		}
3026 	}
3027 
3028 	/* check to see if we have rx token to pass back */
3029 	if (valid & 0x1)
3030 		*ss->irq_claim = be32toh(3);
3031 	*(ss->irq_claim + 1) = be32toh(3);
3032 }
3033 
3034 static void
3035 mxge_init(void *arg)
3036 {
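	/* nothing to do here; actual interface bring-up is done by
	 * mxge_open() */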
3037 }
3038 
3039 
3040 
3041 static void
3042 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3043 {
3044 	struct lro_entry *lro_entry;
3045 	int i;
3046 
3047 	while (!SLIST_EMPTY(&ss->lro_free)) {
3048 		lro_entry = SLIST_FIRST(&ss->lro_free);
3049 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3050 		kfree(lro_entry, M_DEVBUF);
3051 	}
3052 
3053 	for (i = 0; i <= ss->rx_big.mask; i++) {
3054 		if (ss->rx_big.info[i].m == NULL)
3055 			continue;
3056 		bus_dmamap_unload(ss->rx_big.dmat,
3057 				  ss->rx_big.info[i].map);
3058 		m_freem(ss->rx_big.info[i].m);
3059 		ss->rx_big.info[i].m = NULL;
3060 	}
3061 
3062 	for (i = 0; i <= ss->rx_small.mask; i++) {
3063 		if (ss->rx_small.info[i].m == NULL)
3064 			continue;
3065 		bus_dmamap_unload(ss->rx_small.dmat,
3066 				  ss->rx_small.info[i].map);
3067 		m_freem(ss->rx_small.info[i].m);
3068 		ss->rx_small.info[i].m = NULL;
3069 	}
3070 
3071 	/* transmit ring used only on the first slice */
3072 	if (ss->tx.info == NULL)
3073 		return;
3074 
3075 	for (i = 0; i <= ss->tx.mask; i++) {
3076 		ss->tx.info[i].flag = 0;
3077 		if (ss->tx.info[i].m == NULL)
3078 			continue;
3079 		bus_dmamap_unload(ss->tx.dmat,
3080 				  ss->tx.info[i].map);
3081 		m_freem(ss->tx.info[i].m);
3082 		ss->tx.info[i].m = NULL;
3083 	}
3084 }
3085 
3086 static void
3087 mxge_free_mbufs(mxge_softc_t *sc)
3088 {
3089 	int slice;
3090 
3091 	for (slice = 0; slice < sc->num_slices; slice++)
3092 		mxge_free_slice_mbufs(&sc->ss[slice]);
3093 }
3094 
3095 static void
3096 mxge_free_slice_rings(struct mxge_slice_state *ss)
3097 {
3098 	int i;
3099 
3100 
3101 	if (ss->rx_done.entry != NULL)
3102 		mxge_dma_free(&ss->rx_done.dma);
3103 	ss->rx_done.entry = NULL;
3104 
3105 	if (ss->tx.req_bytes != NULL)
3106 		kfree(ss->tx.req_bytes, M_DEVBUF);
3107 	ss->tx.req_bytes = NULL;
3108 
3109 	if (ss->tx.seg_list != NULL)
3110 		kfree(ss->tx.seg_list, M_DEVBUF);
3111 	ss->tx.seg_list = NULL;
3112 
3113 	if (ss->rx_small.shadow != NULL)
3114 		kfree(ss->rx_small.shadow, M_DEVBUF);
3115 	ss->rx_small.shadow = NULL;
3116 
3117 	if (ss->rx_big.shadow != NULL)
3118 		kfree(ss->rx_big.shadow, M_DEVBUF);
3119 	ss->rx_big.shadow = NULL;
3120 
3121 	if (ss->tx.info != NULL) {
3122 		if (ss->tx.dmat != NULL) {
3123 			for (i = 0; i <= ss->tx.mask; i++) {
3124 				bus_dmamap_destroy(ss->tx.dmat,
3125 						   ss->tx.info[i].map);
3126 			}
3127 			bus_dma_tag_destroy(ss->tx.dmat);
3128 		}
3129 		kfree(ss->tx.info, M_DEVBUF);
3130 	}
3131 	ss->tx.info = NULL;
3132 
3133 	if (ss->rx_small.info != NULL) {
3134 		if (ss->rx_small.dmat != NULL) {
3135 			for (i = 0; i <= ss->rx_small.mask; i++) {
3136 				bus_dmamap_destroy(ss->rx_small.dmat,
3137 						   ss->rx_small.info[i].map);
3138 			}
3139 			bus_dmamap_destroy(ss->rx_small.dmat,
3140 					   ss->rx_small.extra_map);
3141 			bus_dma_tag_destroy(ss->rx_small.dmat);
3142 		}
3143 		kfree(ss->rx_small.info, M_DEVBUF);
3144 	}
3145 	ss->rx_small.info = NULL;
3146 
3147 	if (ss->rx_big.info != NULL) {
3148 		if (ss->rx_big.dmat != NULL) {
3149 			for (i = 0; i <= ss->rx_big.mask; i++) {
3150 				bus_dmamap_destroy(ss->rx_big.dmat,
3151 						   ss->rx_big.info[i].map);
3152 			}
3153 			bus_dmamap_destroy(ss->rx_big.dmat,
3154 					   ss->rx_big.extra_map);
3155 			bus_dma_tag_destroy(ss->rx_big.dmat);
3156 		}
3157 		kfree(ss->rx_big.info, M_DEVBUF);
3158 	}
3159 	ss->rx_big.info = NULL;
3160 }
3161 
3162 static void
3163 mxge_free_rings(mxge_softc_t *sc)
3164 {
3165 	int slice;
3166 
3167 	for (slice = 0; slice < sc->num_slices; slice++)
3168 		mxge_free_slice_rings(&sc->ss[slice]);
3169 }
3170 
3171 static int
3172 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3173 		       int tx_ring_entries)
3174 {
3175 	mxge_softc_t *sc = ss->sc;
3176 	size_t bytes;
3177 	int err, i;
3178 
3179 	err = ENOMEM;
3180 
3181 	/* allocate per-slice receive resources */
3182 
3183 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3184 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3185 
3186 	/* allocate the rx shadow rings */
3187 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3188 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3189 	if (ss->rx_small.shadow == NULL)
3190 		return err;
3191 
3192 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3193 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3194 	if (ss->rx_big.shadow == NULL)
3195 		return err;
3196 
3197 	/* allocate the rx host info rings */
3198 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3199 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3200 	if (ss->rx_small.info == NULL)
3201 		return err;
3202 
3203 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3204 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3205 	if (ss->rx_big.info == NULL)
3206 		return err;
3207 
3208 	/* allocate the rx busdma resources */
3209 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3210 				 1,			/* alignment */
3211 				 4096,			/* boundary */
3212 				 BUS_SPACE_MAXADDR,	/* low */
3213 				 BUS_SPACE_MAXADDR,	/* high */
3214 				 NULL, NULL,		/* filter */
3215 				 MHLEN,			/* maxsize */
3216 				 1,			/* num segs */
3217 				 MHLEN,			/* maxsegsize */
3218 				 BUS_DMA_ALLOCNOW,	/* flags */
3219 				 &ss->rx_small.dmat);	/* tag */
3220 	if (err != 0) {
3221 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3222 			      err);
3223 		return err;
3224 	}
3225 
3226 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3227 				 1,			/* alignment */
3228 #if MXGE_VIRT_JUMBOS
3229 				 4096,			/* boundary */
3230 #else
3231 				 0,			/* boundary */
3232 #endif
3233 				 BUS_SPACE_MAXADDR,	/* low */
3234 				 BUS_SPACE_MAXADDR,	/* high */
3235 				 NULL, NULL,		/* filter */
3236 				 3*4096,		/* maxsize */
3237 #if MXGE_VIRT_JUMBOS
3238 				 3,			/* num segs */
3239 				 4096,			/* maxsegsize*/
3240 #else
3241 				 1,			/* num segs */
3242 				 MJUM9BYTES,		/* maxsegsize*/
3243 #endif
3244 				 BUS_DMA_ALLOCNOW,	/* flags */
3245 				 &ss->rx_big.dmat);	/* tag */
3246 	if (err != 0) {
3247 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3248 			      err);
3249 		return err;
3250 	}
3251 	for (i = 0; i <= ss->rx_small.mask; i++) {
3252 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3253 					&ss->rx_small.info[i].map);
3254 		if (err != 0) {
3255 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3256 				      err);
3257 			return err;
3258 		}
3259 	}
3260 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3261 				&ss->rx_small.extra_map);
3262 	if (err != 0) {
3263 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3264 			      err);
3265 		return err;
3266 	}
3267 
3268 	for (i = 0; i <= ss->rx_big.mask; i++) {
3269 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3270 					&ss->rx_big.info[i].map);
3271 		if (err != 0) {
3272 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3273 				      err);
3274 			return err;
3275 		}
3276 	}
3277 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3278 				&ss->rx_big.extra_map);
3279 	if (err != 0) {
3280 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3281 			      err);
3282 		return err;
3283 	}
3284 
3285 	/* now allocate TX resources */
3286 
3287 #ifndef IFNET_BUF_RING
3288 	/* only use a single TX ring for now */
3289 	if (ss != ss->sc->ss)
3290 		return 0;
3291 #endif
3292 
3293 	ss->tx.mask = tx_ring_entries - 1;
3294 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3295 
3296 
3297 	/* allocate the tx request copy block */
3298 	bytes = 8 +
3299 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3300 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3301 	if (ss->tx.req_bytes == NULL)
3302 		return err;
3303 	/* ensure req_list entries are aligned to 8 bytes */
3304 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3305 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3306 
3307 	/* allocate the tx busdma segment list */
3308 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3309 	ss->tx.seg_list = (bus_dma_segment_t *)
3310 		kmalloc(bytes, M_DEVBUF, M_WAITOK);
3311 	if (ss->tx.seg_list == NULL)
3312 		return err;
3313 
3314 	/* allocate the tx host info ring */
3315 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3316 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3317 	if (ss->tx.info == NULL)
3318 		return err;
3319 
3320 	/* allocate the tx busdma resources */
3321 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3322 				 1,			/* alignment */
3323 				 sc->tx_boundary,	/* boundary */
3324 				 BUS_SPACE_MAXADDR,	/* low */
3325 				 BUS_SPACE_MAXADDR,	/* high */
3326 				 NULL, NULL,		/* filter */
3327 				 65536 + 256,		/* maxsize */
3328 				 ss->tx.max_desc - 2,	/* num segs */
3329 				 sc->tx_boundary,	/* maxsegsz */
3330 				 BUS_DMA_ALLOCNOW,	/* flags */
3331 				 &ss->tx.dmat);		/* tag */
3332 
3333 	if (err != 0) {
3334 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3335 			      err);
3336 		return err;
3337 	}
3338 
3339 	/* now use these tags to setup dmamaps for each slot
3340 	   in the ring */
3341 	for (i = 0; i <= ss->tx.mask; i++) {
3342 		err = bus_dmamap_create(ss->tx.dmat, 0,
3343 					&ss->tx.info[i].map);
3344 		if (err != 0) {
3345 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3346 				      err);
3347 			return err;
3348 		}
3349 	}
3350 	return 0;
3351 
3352 }
3353 
3354 static int
3355 mxge_alloc_rings(mxge_softc_t *sc)
3356 {
3357 	mxge_cmd_t cmd;
3358 	int tx_ring_size;
3359 	int tx_ring_entries, rx_ring_entries;
3360 	int err, slice;
3361 
3362 	/* get ring sizes */
3363 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3364 	tx_ring_size = cmd.data0;
3365 	if (err != 0) {
3366 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3367 		goto abort;
3368 	}
3369 
3370 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3371 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3372 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3373 	ifq_set_ready(&sc->ifp->if_snd);
3374 
3375 	for (slice = 0; slice < sc->num_slices; slice++) {
3376 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3377 					     rx_ring_entries,
3378 					     tx_ring_entries);
3379 		if (err != 0)
3380 			goto abort;
3381 	}
3382 	return 0;
3383 
3384 abort:
3385 	mxge_free_rings(sc);
3386 	return err;
3387 
3388 }
3389 
3390 
3391 static void
3392 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3393 {
3394 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3395 
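	/* e.g. a 9000 byte jumbo MTU gives bufsize = 9000 + 14 + 4 + 2
	 * = 9020 bytes, which falls through to the 9KB cluster case */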
3396 	if (bufsize < MCLBYTES) {
3397 		/* easy, everything fits in a single buffer */
3398 		*big_buf_size = MCLBYTES;
3399 		*cl_size = MCLBYTES;
3400 		*nbufs = 1;
3401 		return;
3402 	}
3403 
3404 	if (bufsize < MJUMPAGESIZE) {
3405 		/* still easy, everything still fits in a single buffer */
3406 		*big_buf_size = MJUMPAGESIZE;
3407 		*cl_size = MJUMPAGESIZE;
3408 		*nbufs = 1;
3409 		return;
3410 	}
3411 #if MXGE_VIRT_JUMBOS
3412 	/* now we need to use virtually contiguous buffers */
3413 	*cl_size = MJUM9BYTES;
3414 	*big_buf_size = 4096;
3415 	*nbufs = mtu / 4096 + 1;
3416 	/* needs to be a power of two, so round up */
3417 	if (*nbufs == 3)
3418 		*nbufs = 4;
3419 #else
3420 	*cl_size = MJUM9BYTES;
3421 	*big_buf_size = MJUM9BYTES;
3422 	*nbufs = 1;
3423 #endif
3424 }
3425 
3426 static int
3427 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3428 {
3429 	mxge_softc_t *sc;
3430 	mxge_cmd_t cmd;
3431 	bus_dmamap_t map;
3432 	struct lro_entry *lro_entry;
3433 	int err, i, slice;
3434 
3435 
3436 	sc = ss->sc;
3437 	slice = ss - sc->ss;
3438 
3439 	SLIST_INIT(&ss->lro_free);
3440 	SLIST_INIT(&ss->lro_active);
3441 
3442 	for (i = 0; i < sc->lro_cnt; i++) {
3443 		lro_entry = (struct lro_entry *)
3444 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3445 			       M_NOWAIT | M_ZERO);
3446 		if (lro_entry == NULL) {
3447 			sc->lro_cnt = i;
3448 			break;
3449 		}
3450 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3451 	}
3452 	/* get the lanai pointers to the send and receive rings */
3453 
3454 	err = 0;
3455 #ifndef IFNET_BUF_RING
3456 	/* We currently only send from the first slice */
3457 	if (slice == 0) {
3458 #endif
3459 		cmd.data0 = slice;
3460 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3461 		ss->tx.lanai =
3462 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3463 		ss->tx.send_go = (volatile uint32_t *)
3464 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3465 		ss->tx.send_stop = (volatile uint32_t *)
3466 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3467 #ifndef IFNET_BUF_RING
3468 	}
3469 #endif
3470 	cmd.data0 = slice;
3471 	err |= mxge_send_cmd(sc,
3472 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3473 	ss->rx_small.lanai =
3474 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3475 	cmd.data0 = slice;
3476 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3477 	ss->rx_big.lanai =
3478 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3479 
3480 	if (err != 0) {
3481 		device_printf(sc->dev,
3482 			      "failed to get ring sizes or locations\n");
3483 		return EIO;
3484 	}
3485 
3486 	/* stock receive rings */
3487 	for (i = 0; i <= ss->rx_small.mask; i++) {
3488 		map = ss->rx_small.info[i].map;
3489 		err = mxge_get_buf_small(ss, map, i);
3490 		if (err) {
3491 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3492 				      i, ss->rx_small.mask + 1);
3493 			return ENOMEM;
3494 		}
3495 	}
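	/* poison the big ring shadow with the same all-ones marker that
	 * mxge_submit_8rx() uses for not-yet-valid entries */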
3496 	for (i = 0; i <= ss->rx_big.mask; i++) {
3497 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3498 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3499 	}
3500 	ss->rx_big.nbufs = nbufs;
3501 	ss->rx_big.cl_size = cl_size;
3502 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3503 		EVL_ENCAPLEN + MXGEFW_PAD;
3504 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3505 		map = ss->rx_big.info[i].map;
3506 		err = mxge_get_buf_big(ss, map, i);
3507 		if (err) {
3508 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3509 				      i, ss->rx_big.mask + 1);
3510 			return ENOMEM;
3511 		}
3512 	}
3513 	return 0;
3514 }
3515 
3516 static int
3517 mxge_open(mxge_softc_t *sc)
3518 {
3519 	mxge_cmd_t cmd;
3520 	int err, big_bytes, nbufs, slice, cl_size, i;
3521 	bus_addr_t bus;
3522 	volatile uint8_t *itable;
3523 	struct mxge_slice_state *ss;
3524 
3525 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3526 	/* Copy the MAC address in case it was overridden */
3527 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3528 
3529 	err = mxge_reset(sc, 1);
3530 	if (err != 0) {
3531 		device_printf(sc->dev, "failed to reset\n");
3532 		return EIO;
3533 	}
3534 
3535 	if (sc->num_slices > 1) {
3536 		/* setup the indirection table */
3537 		cmd.data0 = sc->num_slices;
3538 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3539 				    &cmd);
3540 
3541 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3542 				     &cmd);
3543 		if (err != 0) {
3544 			device_printf(sc->dev,
3545 				      "failed to setup rss tables\n");
3546 			return err;
3547 		}
3548 
3549 		/* just enable an identity mapping */
3550 		itable = sc->sram + cmd.data0;
3551 		for (i = 0; i < sc->num_slices; i++)
3552 			itable[i] = (uint8_t)i;
3553 
3554 		cmd.data0 = 1;
3555 		cmd.data1 = mxge_rss_hash_type;
3556 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3557 		if (err != 0) {
3558 			device_printf(sc->dev, "failed to enable slices\n");
3559 			return err;
3560 		}
3561 	}
3562 
3563 
3564 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3565 
3566 	cmd.data0 = nbufs;
3567 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3568 			    &cmd);
3569 	/* error is only meaningful if we're trying to set
3570 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3571 	if (err && nbufs > 1) {
3572 		device_printf(sc->dev,
3573 			      "Failed to set always-use-n to %d\n",
3574 			      nbufs);
3575 		return EIO;
3576 	}
3577 	/* Give the firmware the mtu and the big and small buffer
3578 	   sizes.  The firmware wants the big buf size to be a power
3579 	   of two. Luckily, FreeBSD's clusters are powers of two */
3580 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3581 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3582 	cmd.data0 = MHLEN - MXGEFW_PAD;
3583 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3584 			     &cmd);
3585 	cmd.data0 = big_bytes;
3586 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3587 
3588 	if (err != 0) {
3589 		device_printf(sc->dev, "failed to setup params\n");
3590 		goto abort;
3591 	}
3592 
3593 	/* Now give the firmware the pointer to the stats block */
3594 	for (slice = 0;
3595 #ifdef IFNET_BUF_RING
3596 	     slice < sc->num_slices;
3597 #else
3598 	     slice < 1;
3599 #endif
3600 	     slice++) {
3601 		ss = &sc->ss[slice];
3602 		cmd.data0 =
3603 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3604 		cmd.data1 =
3605 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3606 		cmd.data2 = sizeof(struct mcp_irq_data);
3607 		cmd.data2 |= (slice << 16);
3608 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3609 	}
3610 
3611 	if (err != 0) {
3612 		bus = sc->ss->fw_stats_dma.bus_addr;
3613 		bus += offsetof(struct mcp_irq_data, send_done_count);
3614 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3615 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3616 		err = mxge_send_cmd(sc,
3617 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3618 				    &cmd);
3619 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3620 		sc->fw_multicast_support = 0;
3621 	} else {
3622 		sc->fw_multicast_support = 1;
3623 	}
3624 
3625 	if (err != 0) {
3626 		device_printf(sc->dev, "failed to setup params\n");
3627 		goto abort;
3628 	}
3629 
3630 	for (slice = 0; slice < sc->num_slices; slice++) {
3631 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3632 		if (err != 0) {
3633 			device_printf(sc->dev, "couldn't open slice %d\n",
3634 				      slice);
3635 			goto abort;
3636 		}
3637 	}
3638 
3639 	/* Finally, start the firmware running */
3640 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3641 	if (err) {
3642 		device_printf(sc->dev, "Couldn't bring up link\n");
3643 		goto abort;
3644 	}
3645 #ifdef IFNET_BUF_RING
3646 	for (slice = 0; slice < sc->num_slices; slice++) {
3647 		ss = &sc->ss[slice];
3648 		ss->if_flags |= IFF_RUNNING;
3649 		ss->if_flags &= ~IFF_OACTIVE;
3650 	}
3651 #endif
3652 	sc->ifp->if_flags |= IFF_RUNNING;
3653 	sc->ifp->if_flags &= ~IFF_OACTIVE;
3654 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3655 
3656 	return 0;
3657 
3658 
3659 abort:
3660 	mxge_free_mbufs(sc);
3661 
3662 	return err;
3663 }
3664 
3665 static int
3666 mxge_close(mxge_softc_t *sc)
3667 {
3668 	mxge_cmd_t cmd;
3669 	int err, old_down_cnt;
3670 #ifdef IFNET_BUF_RING
3671 	struct mxge_slice_state *ss;
3672 	int slice;
3673 #endif
3674 
3675 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3676 	callout_stop(&sc->co_hdl);
3677 #ifdef IFNET_BUF_RING
3678 	for (slice = 0; slice < sc->num_slices; slice++) {
3679 		ss = &sc->ss[slice];
3680 		ss->if_flags &= ~IFF_RUNNING;
3681 	}
3682 #endif
3683 	sc->ifp->if_flags &= ~IFF_RUNNING;
3684 	old_down_cnt = sc->down_cnt;
3685 	wmb();
3686 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3687 	if (err) {
3688 		device_printf(sc->dev, "Couldn't bring down link\n");
3689 	}
3690 	if (old_down_cnt == sc->down_cnt) {
3691 		/* wait for down irq */
3692 		DELAY(10 * sc->intr_coal_delay);
3693 	}
3694 	wmb();
3695 	if (old_down_cnt == sc->down_cnt) {
3696 		device_printf(sc->dev, "never got down irq\n");
3697 	}
3698 
3699 	mxge_free_mbufs(sc);
3700 
3701 	return 0;
3702 }
3703 
3704 static void
3705 mxge_setup_cfg_space(mxge_softc_t *sc)
3706 {
3707 	device_t dev = sc->dev;
3708 	int reg;
3709 	uint16_t cmd, lnk, pectl;
3710 
3711 	/* find the PCIe link width and set max read request to 4KB */
3712 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3713 		lnk = pci_read_config(dev, reg + 0x12, 2);
3714 		sc->link_width = (lnk >> 4) & 0x3f;
3715 
3716 		pectl = pci_read_config(dev, reg + 0x8, 2);
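		/* max read request size field (bits 14:12): 101b selects
		 * 4096 bytes */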
3717 		pectl = (pectl & ~0x7000) | (5 << 12);
3718 		pci_write_config(dev, reg + 0x8, pectl, 2);
3719 	}
3720 
3721 	/* Enable DMA and Memory space access */
3722 	pci_enable_busmaster(dev);
3723 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3724 	cmd |= PCIM_CMD_MEMEN;
3725 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3726 }
3727 
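/*
 * Read the firmware reboot status through the vendor-specific PCI
 * capability: enable the 32-bit read window, point it at the reboot
 * status register (0xfffffff0), then read back the data port.
 */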
3728 static uint32_t
3729 mxge_read_reboot(mxge_softc_t *sc)
3730 {
3731 	device_t dev = sc->dev;
3732 	uint32_t vs;
3733 
3734 	/* find the vendor specific offset */
3735 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3736 		device_printf(sc->dev,
3737 			      "could not find vendor specific offset\n");
3738 		return (uint32_t)-1;
3739 	}
3740 	/* enable read32 mode */
3741 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3742 	/* tell NIC which register to read */
3743 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3744 	return (pci_read_config(dev, vs + 0x14, 4));
3745 }
3746 
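/*
 * Try to recover from a wedged transmit ring.  If the NIC rebooted
 * (config space reads as 0xffff, or busmastering was cleared),
 * restore PCI config space and reopen the interface; otherwise just
 * dump the ring state for debugging and leave the NIC alone.
 */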
3747 static int
3748 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3749 {
3750 	struct pci_devinfo *dinfo;
3751 	mxge_tx_ring_t *tx;
3752 	int err;
3753 	uint32_t reboot;
3754 	uint16_t cmd;
3755 
3756 	err = ENXIO;
3757 
3758 	device_printf(sc->dev, "Watchdog reset!\n");
3759 
3760 	/*
3761 	 * check to see if the NIC rebooted.  If it did, then all of
3762 	 * PCI config space has been reset, and things like the
3763 	 * busmaster bit will be zero.  If this is the case, then we
3764 	 * must restore PCI config space before the NIC can be used
3765 	 * again
3766 	 */
3767 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3768 	if (cmd == 0xffff) {
3769 		/*
3770 		 * maybe the watchdog caught the NIC rebooting; wait
3771 		 * up to 100ms for it to finish.  If it does not come
3772 		 * back, then give up
3773 		 */
3774 		DELAY(1000*100);
3775 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3776 		if (cmd == 0xffff) {
3777 			device_printf(sc->dev, "NIC disappeared!\n");
3778 			return (err);
3779 		}
3780 	}
3781 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3782 		/* print the reboot status */
3783 		reboot = mxge_read_reboot(sc);
3784 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3785 			      reboot);
3786 		/* restore PCI configuration space */
3787 		dinfo = device_get_ivars(sc->dev);
3788 		pci_cfg_restore(sc->dev, dinfo);
3789 
3790 		/* and redo any changes we made to our config space */
3791 		mxge_setup_cfg_space(sc);
3792 
3793 		if (sc->ifp->if_flags & IFF_RUNNING) {
3794 			mxge_close(sc);
3795 			err = mxge_open(sc);
3796 		}
3797 	} else {
3798 		tx = &sc->ss[slice].tx;
3799 		device_printf(sc->dev,
3800 			      "NIC did not reboot, slice %d ring state:\n",
3801 			      slice);
3802 		device_printf(sc->dev,
3803 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3804 			      tx->req, tx->done, tx->queue_active);
3805 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3806 			      tx->activate, tx->deactivate);
3807 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3808 			      tx->pkt_done,
3809 			      be32toh(sc->ss->fw_stats->send_done_count));
3810 		device_printf(sc->dev, "not resetting\n");
3811 	}
3812 	return (err);
3813 }
3814 
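/*
 * Transmit watchdog: a slice is considered hung when it has
 * outstanding sends (req != done) but has made no completion
 * progress since the previous check.  Pause frames from the link
 * partner can legitimately stall transmit, so a reset is only done
 * when the dropped_pause counter has not moved either.
 */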
3815 static int
3816 mxge_watchdog(mxge_softc_t *sc)
3817 {
3818 	mxge_tx_ring_t *tx;
3819 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3820 	int i, err = 0;
3821 
3822 	/* see if we have outstanding transmits, which
3823 	   have been pending for more than mxge_ticks */
3824 	for (i = 0;
3825 #ifdef IFNET_BUF_RING
3826 	     (i < sc->num_slices) && (err == 0);
3827 #else
3828 	     (i < 1) && (err == 0);
3829 #endif
3830 	     i++) {
3831 		tx = &sc->ss[i].tx;
3832 		if (tx->req != tx->done &&
3833 		    tx->watchdog_req != tx->watchdog_done &&
3834 		    tx->done == tx->watchdog_done) {
3835 			/* check for pause blocking before resetting */
3836 			if (tx->watchdog_rx_pause == rx_pause)
3837 				err = mxge_watchdog_reset(sc, i);
3838 			else
3839 				device_printf(sc->dev, "Flow control blocking "
3840 					      "xmits, check link partner\n");
3841 		}
3842 
3843 		tx->watchdog_req = tx->req;
3844 		tx->watchdog_done = tx->done;
3845 		tx->watchdog_rx_pause = rx_pause;
3846 	}
3847 
3848 	if (sc->need_media_probe)
3849 		mxge_media_probe(sc);
3850 	return (err);
3851 }
3852 
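/* Aggregate the per-slice counters into the ifnet statistics. */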
3853 static void
3854 mxge_update_stats(mxge_softc_t *sc)
3855 {
3856 	struct mxge_slice_state *ss;
3857 	u_long ipackets = 0;
3858 	u_long opackets = 0;
3859 #ifdef IFNET_BUF_RING
3860 	u_long obytes = 0;
3861 	u_long omcasts = 0;
3862 	u_long odrops = 0;
3863 #endif
3864 	u_long oerrors = 0;
3865 	int slice;
3866 
3867 	for (slice = 0; slice < sc->num_slices; slice++) {
3868 		ss = &sc->ss[slice];
3869 		ipackets += ss->ipackets;
3870 		opackets += ss->opackets;
3871 #ifdef IFNET_BUF_RING
3872 		obytes += ss->obytes;
3873 		omcasts += ss->omcasts;
3874 		odrops += ss->tx.br->br_drops;
3875 #endif
3876 		oerrors += ss->oerrors;
3877 	}
3878 	sc->ifp->if_ipackets = ipackets;
3879 	sc->ifp->if_opackets = opackets;
3880 #ifdef IFNET_BUF_RING
3881 	sc->ifp->if_obytes = obytes;
3882 	sc->ifp->if_omcasts = omcasts;
3883 	sc->ifp->if_snd.ifq_drops = odrops;
3884 #endif
3885 	sc->ifp->if_oerrors = oerrors;
3886 }
3887 
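/*
 * Periodic housekeeping callout: aggregate statistics and run the
 * transmit watchdog every few ticks.
 */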
3888 static void
3889 mxge_tick(void *arg)
3890 {
3891 	mxge_softc_t *sc = arg;
3892 	int err = 0;
3893 
3894 	lwkt_serialize_enter(sc->ifp->if_serializer);
3895 	/* aggregate stats from different slices */
3896 	mxge_update_stats(sc);
3897 	if (!sc->watchdog_countdown) {
3898 		err = mxge_watchdog(sc);
3899 		sc->watchdog_countdown = 4;
3900 	}
3901 	sc->watchdog_countdown--;
3902 	if (err == 0)
3903 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3904 	lwkt_serialize_exit(sc->ifp->if_serializer);
3905 }
3906 
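/* Manual media changes are not supported. */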
3907 static int
3908 mxge_media_change(struct ifnet *ifp)
3909 {
3910 	return EINVAL;
3911 }
3912 
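/*
 * Change the MTU by closing and re-opening the interface, since the
 * receive buffer sizing depends on it.  If the re-open fails, fall
 * back to the previous MTU.
 */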
3913 static int
3914 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3915 {
3916 	struct ifnet *ifp = sc->ifp;
3917 	int real_mtu, old_mtu;
3918 	int err = 0;
3919 
3920 	if (ifp->if_serializer)
3921 		ASSERT_SERIALIZED(ifp->if_serializer);
3922 
3923 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3924 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3925 		return EINVAL;
3926 	old_mtu = ifp->if_mtu;
3927 	ifp->if_mtu = mtu;
3928 	if (ifp->if_flags & IFF_RUNNING) {
3929 		mxge_close(sc);
3930 		err = mxge_open(sc);
3931 		if (err != 0) {
3932 			ifp->if_mtu = old_mtu;
3933 			mxge_close(sc);
3934 			(void) mxge_open(sc);
3935 		}
3936 	}
3937 	return err;
3938 }
3939 
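/*
 * Report media status.  The media is always IFM_AUTO; only the link
 * state (and, with it, full-duplex) varies.
 */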
3940 static void
3941 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3942 {
3943 	mxge_softc_t *sc = ifp->if_softc;
3944 
3946 	if (sc == NULL)
3947 		return;
3948 	ifmr->ifm_status = IFM_AVALID;
3949 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3950 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3951 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3952 }
3953 
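/*
 * Ifnet ioctl handler, called with the interface serializer held.
 * Note that TSO4 depends on TXCSUM, so the two capabilities are
 * enabled and disabled together.
 */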
3954 static int
3955 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3956 {
3957 	mxge_softc_t *sc = ifp->if_softc;
3958 	struct ifreq *ifr = (struct ifreq *)data;
3959 	int err, mask;
3960 
3961 	(void)cr;
3962 	err = 0;
3963 	ASSERT_SERIALIZED(ifp->if_serializer);
3964 	switch (command) {
3965 	case SIOCSIFADDR:
3966 	case SIOCGIFADDR:
3967 		err = ether_ioctl(ifp, command, data);
3968 		break;
3969 
3970 	case SIOCSIFMTU:
3971 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3972 		break;
3973 
3974 	case SIOCSIFFLAGS:
3975 		if (sc->dying) {
3976 			return EINVAL;
3977 		}
3978 		if (ifp->if_flags & IFF_UP) {
3979 			if (!(ifp->if_flags & IFF_RUNNING)) {
3980 				err = mxge_open(sc);
3981 			} else {
				/* take care of promisc and allmulti
				   flag changes */
3984 				mxge_change_promisc(sc,
3985 						    ifp->if_flags & IFF_PROMISC);
3986 				mxge_set_multicast_list(sc);
3987 			}
3988 		} else {
3989 			if (ifp->if_flags & IFF_RUNNING) {
3990 				mxge_close(sc);
3991 			}
3992 		}
3993 		break;
3994 
3995 	case SIOCADDMULTI:
3996 	case SIOCDELMULTI:
3997 		mxge_set_multicast_list(sc);
3998 		break;
3999 
4000 	case SIOCSIFCAP:
4001 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4002 		if (mask & IFCAP_TXCSUM) {
4003 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4004 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4005 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4006 						      | CSUM_TSO);
4007 			} else {
4008 				ifp->if_capenable |= IFCAP_TXCSUM;
4009 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4010 			}
4011 		} else if (mask & IFCAP_RXCSUM) {
4012 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4013 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4014 				sc->csum_flag = 0;
4015 			} else {
4016 				ifp->if_capenable |= IFCAP_RXCSUM;
4017 				sc->csum_flag = 1;
4018 			}
4019 		}
4020 		if (mask & IFCAP_TSO4) {
4021 			if (IFCAP_TSO4 & ifp->if_capenable) {
4022 				ifp->if_capenable &= ~IFCAP_TSO4;
4023 				ifp->if_hwassist &= ~CSUM_TSO;
4024 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4025 				ifp->if_capenable |= IFCAP_TSO4;
4026 				ifp->if_hwassist |= CSUM_TSO;
4027 			} else {
4028 				kprintf("mxge requires tx checksum offload"
4029 				       " be enabled to use TSO\n");
4030 				err = EINVAL;
4031 			}
4032 		}
4033 		if (mask & IFCAP_LRO) {
4034 			if (IFCAP_LRO & ifp->if_capenable)
4035 				err = mxge_change_lro_locked(sc, 0);
4036 			else
4037 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4038 		}
4039 		if (mask & IFCAP_VLAN_HWTAGGING)
4040 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4041 		VLAN_CAPABILITIES(ifp);
4042 
4043 		break;
4044 
4045 	case SIOCGIFMEDIA:
4046 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4047 				    &sc->media, command);
		break;
4049 
4050 	default:
4051 		err = ENOTTY;
	}
4053 	return err;
4054 }
4055 
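/*
 * Fetch the hw.mxge.* loader tunables and sanity-check them.  These
 * would typically be set from /boot/loader.conf, for example (the
 * values here are illustrative, not recommendations):
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 */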
4056 static void
4057 mxge_fetch_tunables(mxge_softc_t *sc)
4058 {
4059 
4060 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4061 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4062 			  &mxge_flow_control);
4063 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4064 			  &mxge_intr_coal_delay);
4065 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4066 			  &mxge_nvidia_ecrc_enable);
4067 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4068 			  &mxge_force_firmware);
4069 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4070 			  &mxge_deassert_wait);
4071 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4072 			  &mxge_verbose);
4073 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4074 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4075 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4076 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4077 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4078 	if (sc->lro_cnt != 0)
4079 		mxge_lro_cnt = sc->lro_cnt;
4080 
4081 	if (bootverbose)
4082 		mxge_verbose = 1;
4083 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4084 		mxge_intr_coal_delay = 30;
4085 	if (mxge_ticks == 0)
4086 		mxge_ticks = hz / 2;
4087 	sc->pause = mxge_flow_control;
4088 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4089 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4090 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4091 	}
4092 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4093 	    mxge_initial_mtu < ETHER_MIN_LEN)
4094 		mxge_initial_mtu = ETHERMTU_JUMBO;
4095 }
4096 
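/*
 * Release the per-slice rx completion rings, the firmware stats
 * blocks, and finally the slice array itself.
 */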
4098 static void
4099 mxge_free_slices(mxge_softc_t *sc)
4100 {
4101 	struct mxge_slice_state *ss;
4102 	int i;
4103 
4105 	if (sc->ss == NULL)
4106 		return;
4107 
4108 	for (i = 0; i < sc->num_slices; i++) {
4109 		ss = &sc->ss[i];
4110 		if (ss->fw_stats != NULL) {
4111 			mxge_dma_free(&ss->fw_stats_dma);
4112 			ss->fw_stats = NULL;
4113 #ifdef IFNET_BUF_RING
4114 			if (ss->tx.br != NULL) {
4115 				drbr_free(ss->tx.br, M_DEVBUF);
4116 				ss->tx.br = NULL;
4117 			}
4118 #endif
4119 		}
4120 		if (ss->rx_done.entry != NULL) {
4121 			mxge_dma_free(&ss->rx_done.dma);
4122 			ss->rx_done.entry = NULL;
4123 		}
4124 	}
4125 	kfree(sc->ss, M_DEVBUF);
4126 	sc->ss = NULL;
4127 }
4128 
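/*
 * Allocate the per-slice state.  max_intr_slots is sized at two
 * entries per receive ring slot, which appears intended to cover
 * completions from both the small and big receive rings.
 */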
4129 static int
4130 mxge_alloc_slices(mxge_softc_t *sc)
4131 {
4132 	mxge_cmd_t cmd;
4133 	struct mxge_slice_state *ss;
4134 	size_t bytes;
4135 	int err, i, max_intr_slots;
4136 
4137 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4138 	if (err != 0) {
4139 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4140 		return err;
4141 	}
4142 	sc->rx_ring_size = cmd.data0;
4143 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4144 
4145 	bytes = sizeof (*sc->ss) * sc->num_slices;
4146 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4147 	if (sc->ss == NULL)
4148 		return (ENOMEM);
4149 	for (i = 0; i < sc->num_slices; i++) {
4150 		ss = &sc->ss[i];
4151 
4152 		ss->sc = sc;
4153 
4154 		/* allocate per-slice rx interrupt queues */
4155 
4156 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4157 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4158 		if (err != 0)
4159 			goto abort;
4160 		ss->rx_done.entry = ss->rx_done.dma.addr;
4161 		bzero(ss->rx_done.entry, bytes);
4162 
4163 		/*
4164 		 * allocate the per-slice firmware stats; stats
		 * (including tx) are used only on the first
4166 		 * slice for now
4167 		 */
4168 #ifndef IFNET_BUF_RING
4169 		if (i > 0)
4170 			continue;
4171 #endif
4172 
4173 		bytes = sizeof (*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
4176 		if (err != 0)
4177 			goto abort;
4178 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4179 #ifdef IFNET_BUF_RING
4180 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4181 					   &ss->tx.lock);
4182 #endif
4183 	}
4184 
4185 	return (0);
4186 
4187 abort:
4188 	mxge_free_slices(sc);
4189 	return (ENOMEM);
4190 }
4191 
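/*
 * Decide how many slices (rx queues) to use.  Multiple slices require
 * the RSS firmware, at least two MSI-X vectors, and an SMP system;
 * the slice count is then clamped to the MSI-X vector count and to
 * either ncpus or hw.mxge.max_slices, and rounded down to a power of
 * two.
 */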
4192 static void
4193 mxge_slice_probe(mxge_softc_t *sc)
4194 {
4195 	mxge_cmd_t cmd;
4196 	char *old_fw;
4197 	int msix_cnt, status, max_intr_slots;
4198 
4199 	sc->num_slices = 1;
	/*
	 * don't enable multiple slices unless they were requested via
	 * the hw.mxge.max_slices tunable, and never on a uniprocessor
	 * system
	 */
4204 
4205 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4206 		return;
4207 
4208 	/* see how many MSI-X interrupts are available */
4209 	msix_cnt = pci_msix_count(sc->dev);
4210 	if (msix_cnt < 2)
4211 		return;
4212 
	/* now load the slice-aware firmware to see what it supports */
4214 	old_fw = sc->fw_name;
4215 	if (old_fw == mxge_fw_aligned)
4216 		sc->fw_name = mxge_fw_rss_aligned;
4217 	else
4218 		sc->fw_name = mxge_fw_rss_unaligned;
4219 	status = mxge_load_firmware(sc, 0);
4220 	if (status != 0) {
4221 		device_printf(sc->dev, "Falling back to a single slice\n");
4222 		return;
4223 	}
4224 
4225 	/* try to send a reset command to the card to see if it
4226 	   is alive */
4227 	memset(&cmd, 0, sizeof (cmd));
4228 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4229 	if (status != 0) {
4230 		device_printf(sc->dev, "failed reset\n");
4231 		goto abort_with_fw;
4232 	}
4233 
4234 	/* get rx ring size */
4235 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4236 	if (status != 0) {
4237 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4238 		goto abort_with_fw;
4239 	}
4240 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4241 
4242 	/* tell it the size of the interrupt queues */
4243 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4244 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4245 	if (status != 0) {
4246 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4247 		goto abort_with_fw;
4248 	}
4249 
4250 	/* ask the maximum number of slices it supports */
4251 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4252 	if (status != 0) {
4253 		device_printf(sc->dev,
4254 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4255 		goto abort_with_fw;
4256 	}
4257 	sc->num_slices = cmd.data0;
4258 	if (sc->num_slices > msix_cnt)
4259 		sc->num_slices = msix_cnt;
4260 
4261 	if (mxge_max_slices == -1) {
4262 		/* cap to number of CPUs in system */
4263 		if (sc->num_slices > ncpus)
4264 			sc->num_slices = ncpus;
4265 	} else {
4266 		if (sc->num_slices > mxge_max_slices)
4267 			sc->num_slices = mxge_max_slices;
4268 	}
4269 	/* make sure it is a power of two */
4270 	while (sc->num_slices & (sc->num_slices - 1))
4271 		sc->num_slices--;
4272 
4273 	if (mxge_verbose)
4274 		device_printf(sc->dev, "using %d slices\n",
4275 			      sc->num_slices);
4276 
4277 	return;
4278 
4279 abort_with_fw:
4280 	sc->fw_name = old_fw;
4281 	(void) mxge_load_firmware(sc, 0);
4282 }
4283 
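/*
 * The MSI-X interrupt path is stubbed out (#if 0) in this port; only
 * the single-interrupt path further below is currently used.
 */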
4284 #if 0
4285 static int
4286 mxge_add_msix_irqs(mxge_softc_t *sc)
4287 {
4288 	size_t bytes;
4289 	int count, err, i, rid;
4290 
4291 	rid = PCIR_BAR(2);
4292 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4293 						    &rid, RF_ACTIVE);
4294 
4295 	if (sc->msix_table_res == NULL) {
4296 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4297 		return ENXIO;
4298 	}
4299 
4300 	count = sc->num_slices;
4301 	err = pci_alloc_msix(sc->dev, &count);
4302 	if (err != 0) {
		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
			      "err = %d\n", sc->num_slices, err);
4305 		goto abort_with_msix_table;
4306 	}
4307 	if (count < sc->num_slices) {
		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
			      sc->num_slices, count);
4310 		device_printf(sc->dev,
4311 			      "Try setting hw.mxge.max_slices to %d\n",
4312 			      count);
4313 		err = ENOSPC;
4314 		goto abort_with_msix;
4315 	}
4316 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4317 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4318 	if (sc->msix_irq_res == NULL) {
4319 		err = ENOMEM;
4320 		goto abort_with_msix;
4321 	}
4322 
4323 	for (i = 0; i < sc->num_slices; i++) {
4324 		rid = i + 1;
4325 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4326 							  SYS_RES_IRQ,
4327 							  &rid, RF_ACTIVE);
4328 		if (sc->msix_irq_res[i] == NULL) {
4329 			device_printf(sc->dev, "couldn't allocate IRQ res"
4330 				      " for message %d\n", i);
4331 			err = ENXIO;
4332 			goto abort_with_res;
4333 		}
4334 	}
4335 
4336 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4337 	sc->msix_ih =  kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4338 
4339 	for (i = 0; i < sc->num_slices; i++) {
4340 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4341 				     INTR_MPSAFE,
4342 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4343 				     sc->ifp->if_serializer);
4344 		if (err != 0) {
4345 			device_printf(sc->dev, "couldn't setup intr for "
4346 				      "message %d\n", i);
4347 			goto abort_with_intr;
4348 		}
4349 	}
4350 
4351 	if (mxge_verbose) {
4352 		device_printf(sc->dev, "using %d msix IRQs:",
4353 			      sc->num_slices);
4354 		for (i = 0; i < sc->num_slices; i++)
4355 			kprintf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4356 		kprintf("\n");
4357 	}
4358 	return (0);
4359 
4360 abort_with_intr:
4361 	for (i = 0; i < sc->num_slices; i++) {
4362 		if (sc->msix_ih[i] != NULL) {
4363 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4364 					  sc->msix_ih[i]);
4365 			sc->msix_ih[i] = NULL;
4366 		}
4367 	}
4368 	kfree(sc->msix_ih, M_DEVBUF);
4369 
4371 abort_with_res:
4372 	for (i = 0; i < sc->num_slices; i++) {
4373 		rid = i + 1;
4374 		if (sc->msix_irq_res[i] != NULL)
4375 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4376 					     sc->msix_irq_res[i]);
4377 		sc->msix_irq_res[i] = NULL;
4378 	}
4379 	kfree(sc->msix_irq_res, M_DEVBUF);
4380 
4382 abort_with_msix:
4383 	pci_release_msi(sc->dev);
4384 
4385 abort_with_msix_table:
4386 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4387 			     sc->msix_table_res);
4388 
4389 	return err;
4390 }
4391 #endif
4392 
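/*
 * Set up a single interrupt: MSI when OLD_MSI is defined and a
 * message is available, otherwise a shareable INTx line.
 */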
4393 static int
4394 mxge_add_single_irq(mxge_softc_t *sc)
4395 {
4396 	int count, err, rid;
4397 
4398 #ifdef OLD_MSI
4399 	count = pci_msi_count(sc->dev);
4400 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4401 		rid = 1;
4402 	} else {
4403 		rid = 0;
4404 		sc->legacy_irq = 1;
4405 	}
4406 #else
4407 	count = 0;
4408 	rid = 0;
4409 	sc->legacy_irq = 1;
4410 #endif
4411 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4412 					 1, RF_SHAREABLE | RF_ACTIVE);
4413 	if (sc->irq_res == NULL) {
4414 		device_printf(sc->dev, "could not alloc interrupt\n");
4415 		return ENXIO;
4416 	}
4417 	if (mxge_verbose)
4418 		device_printf(sc->dev, "using %s irq %ld\n",
4419 			      sc->legacy_irq ? "INTx" : "MSI",
4420 			      rman_get_start(sc->irq_res));
4421 	err = bus_setup_intr(sc->dev, sc->irq_res,
4422 			     INTR_MPSAFE,
4423 			     mxge_intr, &sc->ss[0], &sc->ih,
4424 			     sc->ifp->if_serializer);
4425 	if (err != 0) {
4426 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4427 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4428 		if (!sc->legacy_irq)
4429 			pci_release_msi(sc->dev);
4430 	}
4431 	return err;
4432 }
4433 
4434 #if 0
4435 static void
4436 mxge_rem_msix_irqs(mxge_softc_t *sc)
4437 {
4438 	int i, rid;
4439 
4440 	for (i = 0; i < sc->num_slices; i++) {
4441 		if (sc->msix_ih[i] != NULL) {
4442 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4443 					  sc->msix_ih[i]);
4444 			sc->msix_ih[i] = NULL;
4445 		}
4446 	}
4447 	kfree(sc->msix_ih, M_DEVBUF);
4448 
4449 	for (i = 0; i < sc->num_slices; i++) {
4450 		rid = i + 1;
4451 		if (sc->msix_irq_res[i] != NULL)
4452 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4453 					     sc->msix_irq_res[i]);
4454 		sc->msix_irq_res[i] = NULL;
4455 	}
4456 	kfree(sc->msix_irq_res, M_DEVBUF);
4457 
4458 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4459 			     sc->msix_table_res);
4460 
4461 	pci_release_msi(sc->dev);
4462 	return;
4463 }
4464 #endif
4465 
4466 static void
4467 mxge_rem_single_irq(mxge_softc_t *sc)
4468 {
4469 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4470 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4471 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4472 	if (!sc->legacy_irq)
4473 		pci_release_msi(sc->dev);
4474 }
4475 
4476 static void
4477 mxge_rem_irq(mxge_softc_t *sc)
4478 {
4479 #if 0
4480 	if (sc->num_slices > 1)
4481 		mxge_rem_msix_irqs(sc);
4482 	else
4483 #endif
4484 		mxge_rem_single_irq(sc);
4485 }
4486 
4487 static int
4488 mxge_add_irq(mxge_softc_t *sc)
4489 {
4490 #if 0
4491 	int err;
4492 
4493 	if (sc->num_slices > 1)
4494 		err = mxge_add_msix_irqs(sc);
4495 	else
4496 		err = mxge_add_single_irq(sc);
4497 
4498 	if (0 && err == 0 && sc->num_slices > 1) {
4499 		mxge_rem_msix_irqs(sc);
4500 		err = mxge_add_msix_irqs(sc);
4501 	}
4502 	return err;
4503 #else
4504 	return mxge_add_single_irq(sc);
4505 #endif
4506 }
4507 
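/*
 * Device attach: map the board, parse the EEPROM strings, select and
 * load firmware, allocate slices and rings, and attach the ifnet.
 * The abort_with_* labels unwind the allocations in reverse order.
 */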
4509 static int
4510 mxge_attach(device_t dev)
4511 {
4512 	mxge_softc_t *sc = device_get_softc(dev);
4513 	struct ifnet *ifp = &sc->arpcom.ac_if;
4514 	int err, rid;
4515 
4516 	/*
4517 	 * avoid rewriting half the lines in this file to use
4518 	 * &sc->arpcom.ac_if instead
4519 	 */
4520 	sc->ifp = ifp;
4521 	sc->dev = dev;
4522 	mxge_fetch_tunables(sc);
4523 
4524 	err = bus_dma_tag_create(NULL,			/* parent */
4525 				 1,			/* alignment */
4526 				 0,			/* boundary */
4527 				 BUS_SPACE_MAXADDR,	/* low */
4528 				 BUS_SPACE_MAXADDR,	/* high */
4529 				 NULL, NULL,		/* filter */
4530 				 65536 + 256,		/* maxsize */
4531 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4532 				 65536,			/* maxsegsize */
4533 				 0,			/* flags */
4534 				 &sc->parent_dmat);	/* tag */
4535 
4536 	if (err != 0) {
4537 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4538 			      err);
4539 		goto abort_with_nothing;
4540 	}
4541 
4543 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4544 
4545 	callout_init_mp(&sc->co_hdl);
4546 
4547 	mxge_setup_cfg_space(sc);
4548 
4549 	/* Map the board into the kernel */
4550 	rid = PCIR_BARS;
4551 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4552 					 ~0, 1, RF_ACTIVE);
4553 	if (sc->mem_res == NULL) {
4554 		device_printf(dev, "could not map memory\n");
4555 		err = ENXIO;
4556 		goto abort_with_nothing;
4557 	}
4558 	sc->sram = rman_get_virtual(sc->mem_res);
4559 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4560 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4561 		device_printf(dev, "impossible memory region size %ld\n",
4562 			      rman_get_size(sc->mem_res));
4563 		err = ENXIO;
4564 		goto abort_with_mem_res;
4565 	}
4566 
	/* make a NUL-terminated copy of the EEPROM strings section of
	   LANai SRAM */
4569 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4570 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4571 				rman_get_bushandle(sc->mem_res),
4572 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4573 				sc->eeprom_strings,
4574 				MXGE_EEPROM_STRINGS_SIZE - 2);
4575 	err = mxge_parse_strings(sc);
4576 	if (err != 0)
4577 		goto abort_with_mem_res;
4578 
4579 	/* Enable write combining for efficient use of PCIe bus */
4580 	mxge_enable_wc(sc);
4581 
4582 	/* Allocate the out of band dma memory */
4583 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4584 			     sizeof (mxge_cmd_t), 64);
4585 	if (err != 0)
4586 		goto abort_with_mem_res;
4587 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4588 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4589 	if (err != 0)
4590 		goto abort_with_cmd_dma;
4591 
4592 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4593 	if (err != 0)
4594 		goto abort_with_zeropad_dma;
4595 
4596 	/* select & load the firmware */
4597 	err = mxge_select_firmware(sc);
4598 	if (err != 0)
4599 		goto abort_with_dmabench;
4600 	sc->intr_coal_delay = mxge_intr_coal_delay;
4601 
4602 	mxge_slice_probe(sc);
4603 	err = mxge_alloc_slices(sc);
4604 	if (err != 0)
4605 		goto abort_with_dmabench;
4606 
4607 	err = mxge_reset(sc, 0);
4608 	if (err != 0)
4609 		goto abort_with_slices;
4610 
4611 	err = mxge_alloc_rings(sc);
4612 	if (err != 0) {
4613 		device_printf(sc->dev, "failed to allocate rings\n");
4614 		goto abort_with_dmabench;
4615 	}
4616 
4617 	ifp->if_baudrate = IF_Gbps(10UL);
4618 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4619 		IFCAP_VLAN_MTU;
4620 #ifdef INET
4621 	ifp->if_capabilities |= IFCAP_LRO;
4622 #endif
4623 
4624 #ifdef MXGE_NEW_VLAN_API
4625 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4626 #endif
4627 
4628 	sc->max_mtu = mxge_max_mtu(sc);
4629 	if (sc->max_mtu >= 9000)
4630 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4631 	else
4632 		device_printf(dev, "MTU limited to %d.  Install "
4633 			      "latest firmware for 9000 byte jumbo support\n",
4634 			      sc->max_mtu - ETHER_HDR_LEN);
4635 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4636 	ifp->if_capenable = ifp->if_capabilities;
4637 	if (sc->lro_cnt == 0)
4638 		ifp->if_capenable &= ~IFCAP_LRO;
4639 	sc->csum_flag = 1;
	ifp->if_init = mxge_init;
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = mxge_ioctl;
	ifp->if_start = mxge_start;
4645 	/* Initialise the ifmedia structure */
4646 	ifmedia_init(&sc->media, 0, mxge_media_change,
4647 		     mxge_media_status);
4648 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4649 	mxge_media_probe(sc);
4650 	sc->dying = 0;
4651 	ether_ifattach(ifp, sc->mac_addr, NULL);
4652 	/* ether_ifattach sets mtu to ETHERMTU */
4653 	if (mxge_initial_mtu != ETHERMTU) {
4654 		lwkt_serialize_enter(ifp->if_serializer);
4655 		mxge_change_mtu(sc, mxge_initial_mtu);
4656 		lwkt_serialize_exit(ifp->if_serializer);
4657 	}
4658 	/* must come after ether_ifattach() */
4659 	err = mxge_add_irq(sc);
4660 	if (err != 0) {
4661 		device_printf(sc->dev, "failed to add irq\n");
4662 		goto abort_with_rings;
4663 	}
4664 
4665 	mxge_add_sysctls(sc);
4666 #ifdef IFNET_BUF_RING
4667 	ifp->if_transmit = mxge_transmit;
4668 	ifp->if_qflush = mxge_qflush;
4669 #endif
4670 	return 0;
4671 
4672 abort_with_rings:
4673 	mxge_free_rings(sc);
4674 abort_with_slices:
4675 	mxge_free_slices(sc);
4676 abort_with_dmabench:
4677 	mxge_dma_free(&sc->dmabench_dma);
4678 abort_with_zeropad_dma:
4679 	mxge_dma_free(&sc->zeropad_dma);
4680 abort_with_cmd_dma:
4681 	mxge_dma_free(&sc->cmd_dma);
4682 abort_with_mem_res:
4683 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4684 	pci_disable_busmaster(dev);
4685 	bus_dma_tag_destroy(sc->parent_dmat);
4686 abort_with_nothing:
4687 	return err;
4688 }
4689 
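/* Device detach: tear everything down in the reverse order of attach. */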
4690 static int
4691 mxge_detach(device_t dev)
4692 {
4693 	mxge_softc_t *sc = device_get_softc(dev);
4694 
4695 	lwkt_serialize_enter(sc->ifp->if_serializer);
4696 	sc->dying = 1;
4697 	if (sc->ifp->if_flags & IFF_RUNNING)
4698 		mxge_close(sc);
4699 	/*
4700 	 * XXX: race: the callout callback could be spinning on
4701 	 * the serializer and run anyway
4702 	 */
4703 	callout_stop(&sc->co_hdl);
4704 	lwkt_serialize_exit(sc->ifp->if_serializer);
4705 
4706 	ether_ifdetach(sc->ifp);
4707 	ifmedia_removeall(&sc->media);
4708 	mxge_dummy_rdma(sc, 0);
4709 	mxge_rem_sysctls(sc);
4710 	mxge_rem_irq(sc);
4711 	mxge_free_rings(sc);
4712 	mxge_free_slices(sc);
4713 	mxge_dma_free(&sc->dmabench_dma);
4714 	mxge_dma_free(&sc->zeropad_dma);
4715 	mxge_dma_free(&sc->cmd_dma);
4716 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4717 	pci_disable_busmaster(dev);
4718 	bus_dma_tag_destroy(sc->parent_dmat);
4719 	return 0;
4720 }
4721 
4722 static int
4723 mxge_shutdown(device_t dev)
4724 {
4725 	return 0;
4726 }
4727 
4728 /*
4729   This file uses Myri10GE driver indentation.
4730 
4731   Local Variables:
4732   c-file-style:"linux"
4733   tab-width:8
4734   End:
4735 */
4736