xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 39beb93c)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
54 
55 #include <net/bpf.h>
56 
57 #include <net/if_types.h>
58 #include <net/if_vlan_var.h>
59 #include <net/zlib.h>
60 
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
65 
66 #include <machine/bus.h>
67 #include <machine/in_cksum.h>
68 #include <machine/resource.h>
69 #include <sys/bus.h>
70 #include <sys/rman.h>
71 #include <sys/smp.h>
72 
73 #include <dev/pci/pcireg.h>
74 #include <dev/pci/pcivar.h>
75 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
76 
77 #include <vm/vm.h>		/* for pmap_mapdev() */
78 #include <vm/pmap.h>
79 
80 #if defined(__i386) || defined(__amd64)
81 #include <machine/specialreg.h>
82 #endif
83 
84 #include <dev/mxge/mxge_mcp.h>
85 #include <dev/mxge/mcp_gen_header.h>
86 /*#define MXGE_FAKE_IFP*/
87 #include <dev/mxge/if_mxge_var.h>
88 
89 /* tunable params */
90 static int mxge_nvidia_ecrc_enable = 1;
91 static int mxge_force_firmware = 0;
92 static int mxge_intr_coal_delay = 30;
93 static int mxge_deassert_wait = 1;
94 static int mxge_flow_control = 1;
95 static int mxge_verbose = 0;
96 static int mxge_lro_cnt = 8;
97 static int mxge_ticks;
98 static int mxge_max_slices = 1;
99 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
100 static int mxge_always_promisc = 0;
101 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
102 static char *mxge_fw_aligned = "mxge_eth_z8e";
103 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
104 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
105 
106 static int mxge_probe(device_t dev);
107 static int mxge_attach(device_t dev);
108 static int mxge_detach(device_t dev);
109 static int mxge_shutdown(device_t dev);
110 static void mxge_intr(void *arg);
111 
112 static device_method_t mxge_methods[] =
113 {
114   /* Device interface */
115   DEVMETHOD(device_probe, mxge_probe),
116   DEVMETHOD(device_attach, mxge_attach),
117   DEVMETHOD(device_detach, mxge_detach),
118   DEVMETHOD(device_shutdown, mxge_shutdown),
119   {0, 0}
120 };
121 
122 static driver_t mxge_driver =
123 {
124   "mxge",
125   mxge_methods,
126   sizeof(mxge_softc_t),
127 };
128 
129 static devclass_t mxge_devclass;
130 
131 /* Declare ourselves to be a child of the PCI bus.*/
132 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
133 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
134 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
135 
136 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
137 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
138 static int mxge_close(mxge_softc_t *sc);
139 static int mxge_open(mxge_softc_t *sc);
140 static void mxge_tick(void *arg);
141 
142 static int
143 mxge_probe(device_t dev)
144 {
145 	int rev;
146 
147 
148 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
149 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
150 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
151 		rev = pci_get_revid(dev);
152 		switch (rev) {
153 		case MXGE_PCI_REV_Z8E:
154 			device_set_desc(dev, "Myri10G-PCIE-8A");
155 			break;
156 		case MXGE_PCI_REV_Z8ES:
157 			device_set_desc(dev, "Myri10G-PCIE-8B");
158 			break;
159 		default:
160 			device_set_desc(dev, "Myri10G-PCIE-8??");
161 			device_printf(dev, "Unrecognized rev %d NIC\n",
162 				      rev);
163 			break;
164 		}
165 		return 0;
166 	}
167 	return ENXIO;
168 }
169 
170 static void
171 mxge_enable_wc(mxge_softc_t *sc)
172 {
173 #if defined(__i386) || defined(__amd64)
174 	vm_offset_t len;
175 	int err;
176 
177 	sc->wc = 1;
178 	len = rman_get_size(sc->mem_res);
179 	err = pmap_change_attr((vm_offset_t) sc->sram,
180 			       len, PAT_WRITE_COMBINING);
181 	if (err != 0) {
182 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
183 			      err);
184 		sc->wc = 0;
185 	}
186 #endif
187 }
188 
189 
190 /* callback to get our DMA address */
191 static void
192 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
193 			 int error)
194 {
195 	if (error == 0) {
196 		*(bus_addr_t *) arg = segs->ds_addr;
197 	}
198 }
199 
200 static int
201 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
202 		   bus_size_t alignment)
203 {
204 	int err;
205 	device_t dev = sc->dev;
206 	bus_size_t boundary, maxsegsize;
207 
208 	if (bytes > 4096 && alignment == 4096) {
209 		boundary = 0;
210 		maxsegsize = bytes;
211 	} else {
212 		boundary = 4096;
213 		maxsegsize = 4096;
214 	}
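	/* presumably the firmware wants each allocation either kept
	   within a single 4KB page, or (for large page-aligned rings)
	   delivered as one physically contiguous segment */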
215 
216 	/* allocate DMAable memory tags */
217 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
218 				 alignment,		/* alignment */
219 				 boundary,		/* boundary */
220 				 BUS_SPACE_MAXADDR,	/* low */
221 				 BUS_SPACE_MAXADDR,	/* high */
222 				 NULL, NULL,		/* filter */
223 				 bytes,			/* maxsize */
224 				 1,			/* num segs */
225 				 maxsegsize,		/* maxsegsize */
226 				 BUS_DMA_COHERENT,	/* flags */
227 				 NULL, NULL,		/* lock */
228 				 &dma->dmat);		/* tag */
229 	if (err != 0) {
230 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
231 		return err;
232 	}
233 
234 	/* allocate DMAable memory & map */
235 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
236 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
237 				| BUS_DMA_ZERO),  &dma->map);
238 	if (err != 0) {
239 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
240 		goto abort_with_dmat;
241 	}
242 
243 	/* load the memory */
244 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
245 			      mxge_dmamap_callback,
246 			      (void *)&dma->bus_addr, 0);
247 	if (err != 0) {
248 		device_printf(dev, "couldn't load map (err = %d)\n", err);
249 		goto abort_with_mem;
250 	}
251 	return 0;
252 
253 abort_with_mem:
254 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
255 abort_with_dmat:
256 	(void)bus_dma_tag_destroy(dma->dmat);
257 	return err;
258 }
259 
260 
261 static void
262 mxge_dma_free(mxge_dma_t *dma)
263 {
264 	bus_dmamap_unload(dma->dmat, dma->map);
265 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
266 	(void)bus_dma_tag_destroy(dma->dmat);
267 }
268 
269 /*
270  * The eeprom strings on the lanaiX have the format
271  * SN=x\0
272  * MAC=x:x:x:x:x:x\0
273  * PC=text\0
274  */
275 
276 static int
277 mxge_parse_strings(mxge_softc_t *sc)
278 {
279 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
280 
281 	char *ptr, *limit;
282 	int i, found_mac;
283 
284 	ptr = sc->eeprom_strings;
285 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
286 	found_mac = 0;
287 	while (ptr < limit && *ptr != '\0') {
288 		if (memcmp(ptr, "MAC=", 4) == 0) {
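			/* "MAC=" is 4 bytes, but the loop below does
			   ptr += 3 before each read, so starting one
			   byte in lands each strtoul() on a hex pair */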
289 			ptr += 1;
290 			sc->mac_addr_string = ptr;
291 			for (i = 0; i < 6; i++) {
292 				ptr += 3;
293 				if ((ptr + 2) > limit)
294 					goto abort;
295 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
296 				found_mac = 1;
297 			}
298 		} else if (memcmp(ptr, "PC=", 3) == 0) {
299 			ptr += 3;
300 			strncpy(sc->product_code_string, ptr,
301 				sizeof (sc->product_code_string) - 1);
302 		} else if (memcmp(ptr, "SN=", 3) == 0) {
303 			ptr += 3;
304 			strncpy(sc->serial_number_string, ptr,
305 				sizeof (sc->serial_number_string) - 1);
306 		}
307 		MXGE_NEXT_STRING(ptr);
308 	}
309 
310 	if (found_mac)
311 		return 0;
312 
313  abort:
314 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
315 
316 	return ENXIO;
317 }
318 
319 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
320 static void
321 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
322 {
323 	uint32_t val;
324 	unsigned long base, off;
325 	char *va, *cfgptr;
326 	device_t pdev, mcp55;
327 	uint16_t vendor_id, device_id, word;
328 	uintptr_t bus, slot, func, ivend, idev;
329 	uint32_t *ptr32;
330 
331 
332 	if (!mxge_nvidia_ecrc_enable)
333 		return;
334 
335 	pdev = device_get_parent(device_get_parent(sc->dev));
336 	if (pdev == NULL) {
337 		device_printf(sc->dev, "could not find parent?\n");
338 		return;
339 	}
340 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
341 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
342 
343 	if (vendor_id != 0x10de)
344 		return;
345 
346 	base = 0;
347 
348 	if (device_id == 0x005d) {
349 		/* ck804, base address is magic */
350 		base = 0xe0000000UL;
351 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
352 		/* mcp55, base address stored in chipset */
353 		mcp55 = pci_find_bsf(0, 0, 0);
354 		if (mcp55 &&
355 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
356 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
357 			word = pci_read_config(mcp55, 0x90, 2);
358 			base = ((unsigned long)word & 0x7ffeU) << 25;
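			/* i.e. bits 14:1 of that config word supply
			   bits 39:26 of the extended config window base */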
359 		}
360 	}
361 	if (!base)
362 		return;
363 
364 	/* XXXX
365 	   The test below is commented out because it is believed that
366 	   config reads/writes beyond 0xff will access the config space
367 	   of the next function.  Uncomment this and remove the hacky
368 	   pmap_mapdev() way of accessing config space when FreeBSD
369 	   grows support for extended PCIe config space access.
370 	*/
371 #if 0
372 	/* See if we can, by some miracle, access the extended
373 	   config space */
374 	val = pci_read_config(pdev, 0x178, 4);
375 	if (val != 0xffffffff) {
376 		val |= 0x40;
377 		pci_write_config(pdev, 0x178, val, 4);
378 		return;
379 	}
380 #endif
381 	/* Rather than using normal pci config space writes, we must
382 	 * map the Nvidia config space ourselves.  This is because on
383 	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
384 	 * handled by the Nvidia chipset, which means the internal PCI
385 	 * device (the on-chip northbridge), the AMD-8131 bridge, and
386 	 * things behind them are not visible via this method.
387 	 */
388 
389 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
390 		      PCI_IVAR_BUS, &bus);
391 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
392 		      PCI_IVAR_SLOT, &slot);
393 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
394 		      PCI_IVAR_FUNCTION, &func);
395 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
396 		      PCI_IVAR_VENDOR, &ivend);
397 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
398 		      PCI_IVAR_DEVICE, &idev);
399 
400 	off =  base
401 		+ 0x00100000UL * (unsigned long)bus
402 		+ 0x00001000UL * (unsigned long)(func
403 						 + 8 * slot);
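	/* standard PCIe extended config (ECAM) layout: 1MB of config
	   space per bus, 4KB per function, 8 functions per slot */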
404 
405 	/* map it into the kernel */
406 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
407 
408 
409 	if (va == NULL) {
410 		device_printf(sc->dev, "pmap_mapdev() failed\n");
411 		return;
412 	}
413 	/* get a pointer to the config space mapped into the kernel */
414 	cfgptr = va + (off & PAGE_MASK);
415 
416 	/* make sure that we can really access it */
417 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
418 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
419 	if (! (vendor_id == ivend && device_id == idev)) {
420 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
421 			      vendor_id, device_id);
422 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
423 		return;
424 	}
425 
426 	ptr32 = (uint32_t*)(cfgptr + 0x178);
427 	val = *ptr32;
428 
429 	if (val == 0xffffffff) {
430 		device_printf(sc->dev, "extended mapping failed\n");
431 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
432 		return;
433 	}
434 	*ptr32 = val | 0x40;
435 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
436 	if (mxge_verbose)
437 		device_printf(sc->dev,
438 			      "Enabled ECRC on upstream Nvidia bridge "
439 			      "at %d:%d:%d\n",
440 			      (int)bus, (int)slot, (int)func);
441 	return;
442 }
443 #else
444 static void
445 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
446 {
447 	device_printf(sc->dev,
448 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
449 	return;
450 }
451 #endif
452 
453 
454 static int
455 mxge_dma_test(mxge_softc_t *sc, int test_type)
456 {
457 	mxge_cmd_t cmd;
458 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
459 	int status;
460 	uint32_t len;
461 	char *test = " ";
462 
463 
464 	/* Run a small DMA test.
465 	 * The magic multipliers to the length tell the firmware
466 	 * to do DMA read, write, or read+write tests.  The
467 	 * results are returned in cmd.data0.  The upper 16
468 	 * bits of the return is the number of transfers completed.
469 	 * The lower 16 bits is the time in 0.5us ticks that the
470 	 * transfers took to complete.
471 	 */
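	/* (data2 = len * 0x10000 puts the length in the upper half and
	   requests the read test, len * 0x1 the write test, and
	   len * 0x10001 the combined test, as the labels below show) */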
472 
473 	len = sc->tx_boundary;
474 
475 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
476 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
477 	cmd.data2 = len * 0x10000;
478 	status = mxge_send_cmd(sc, test_type, &cmd);
479 	if (status != 0) {
480 		test = "read";
481 		goto abort;
482 	}
483 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
484 		(cmd.data0 & 0xffff);
485 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
486 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
487 	cmd.data2 = len * 0x1;
488 	status = mxge_send_cmd(sc, test_type, &cmd);
489 	if (status != 0) {
490 		test = "write";
491 		goto abort;
492 	}
493 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
494 		(cmd.data0 & 0xffff);
495 
496 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
497 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
498 	cmd.data2 = len * 0x10001;
499 	status = mxge_send_cmd(sc, test_type, &cmd);
500 	if (status != 0) {
501 		test = "read/write";
502 		goto abort;
503 	}
504 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
505 		(cmd.data0 & 0xffff);
506 
507 abort:
508 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
509 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
510 			      test, status);
511 
512 	return status;
513 }
514 
515 /*
516  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
517  * when the PCI-E Completion packets are aligned on an 8-byte
518  * boundary.  Some PCI-E chip sets always align Completion packets; on
519  * the ones that do not, the alignment can be enforced by enabling
520  * ECRC generation (if supported).
521  *
522  * When PCI-E Completion packets are not aligned, it is actually more
523  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
524  *
525  * If the driver can neither enable ECRC nor verify that it has
526  * already been enabled, then it must use a firmware image which works
527  * around unaligned completion packets (ethp_z8e.dat), and it should
528  * also ensure that it never gives the device a Read-DMA which is
529  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
530  * enabled, then the driver should use the aligned (eth_z8e.dat)
531  * firmware image, and set tx_boundary to 4KB.
532  */
533 
534 static int
535 mxge_firmware_probe(mxge_softc_t *sc)
536 {
537 	device_t dev = sc->dev;
538 	int reg, status;
539 	uint16_t pectl;
540 
541 	sc->tx_boundary = 4096;
542 	/*
543 	 * Verify the max read request size was set to 4KB
544 	 * before trying the test with 4KB.
545 	 */
546 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
547 		pectl = pci_read_config(dev, reg + 0x8, 2);
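		/* the PCIe device control register is at cap + 0x8;
		   max read request size is bits 14:12, 5 => 4096 bytes */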
548 		if ((pectl & (5 << 12)) != (5 << 12)) {
549 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
550 				      pectl);
551 			sc->tx_boundary = 2048;
552 		}
553 	}
554 
555 	/*
556 	 * load the optimized firmware (which assumes aligned PCIe
557 	 * completions) in order to see if it works on this host.
558 	 */
559 	sc->fw_name = mxge_fw_aligned;
560 	status = mxge_load_firmware(sc, 1);
561 	if (status != 0) {
562 		return status;
563 	}
564 
565 	/*
566 	 * Enable ECRC if possible
567 	 */
568 	mxge_enable_nvidia_ecrc(sc);
569 
570 	/*
571 	 * Run a DMA test which watches for unaligned completions and
572 	 * aborts on the first one seen.
573 	 */
574 
575 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
576 	if (status == 0)
577 		return 0; /* keep the aligned firmware */
578 
579 	if (status != E2BIG)
580 		device_printf(dev, "DMA test failed: %d\n", status);
581 	if (status == ENOSYS)
582 		device_printf(dev, "Falling back to ethp! "
583 			      "Please install up-to-date firmware\n");
584 	return status;
585 }
586 
587 static int
588 mxge_select_firmware(mxge_softc_t *sc)
589 {
590 	int aligned = 0;
591 
592 
593 	if (mxge_force_firmware != 0) {
594 		if (mxge_force_firmware == 1)
595 			aligned = 1;
596 		else
597 			aligned = 0;
598 		if (mxge_verbose)
599 			device_printf(sc->dev,
600 				      "Assuming %s completions (forced)\n",
601 				      aligned ? "aligned" : "unaligned");
602 		goto abort;
603 	}
604 
605 	/* if the PCIe link width is 4 or less, we can use the aligned
606 	   firmware and skip any checks */
607 	if (sc->link_width != 0 && sc->link_width <= 4) {
608 		device_printf(sc->dev,
609 			      "PCIe x%d Link, expect reduced performance\n",
610 			      sc->link_width);
611 		aligned = 1;
612 		goto abort;
613 	}
614 
615 	if (0 == mxge_firmware_probe(sc))
616 		return 0;
617 
618 abort:
619 	if (aligned) {
620 		sc->fw_name = mxge_fw_aligned;
621 		sc->tx_boundary = 4096;
622 	} else {
623 		sc->fw_name = mxge_fw_unaligned;
624 		sc->tx_boundary = 2048;
625 	}
626 	return (mxge_load_firmware(sc, 0));
627 }
628 
629 union qualhack
630 {
631         const char *ro_char;
632         char *rw_char;
633 };
634 
635 static int
636 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
637 {
638 
639 
640 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
641 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
642 			      be32toh(hdr->mcp_type));
643 		return EIO;
644 	}
645 
646 	/* save firmware version for sysctl */
647 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
648 	if (mxge_verbose)
649 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
650 
651 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
652 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
653 
654 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
655 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
656 		device_printf(sc->dev, "Found firmware version %s\n",
657 			      sc->fw_version);
658 		device_printf(sc->dev, "Driver needs %d.%d\n",
659 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
660 		return EINVAL;
661 	}
662 	return 0;
663 
664 }
665 
666 static void *
667 z_alloc(void *nil, u_int items, u_int size)
668 {
669         void *ptr;
670 
671         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
672         return ptr;
673 }
674 
675 static void
676 z_free(void *nil, void *ptr)
677 {
678         free(ptr, M_TEMP);
679 }
680 
681 
682 static int
683 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
684 {
685 	z_stream zs;
686 	char *inflate_buffer;
687 	const struct firmware *fw;
688 	const mcp_gen_header_t *hdr;
689 	unsigned hdr_offset;
690 	int status;
691 	unsigned int i;
692 	char dummy;
693 	size_t fw_len;
694 
695 	fw = firmware_get(sc->fw_name);
696 	if (fw == NULL) {
697 		device_printf(sc->dev, "Could not find firmware image %s\n",
698 			      sc->fw_name);
699 		return ENOENT;
700 	}
701 
702 
703 
704 	/* setup zlib and decompress f/w */
705 	bzero(&zs, sizeof (zs));
706 	zs.zalloc = z_alloc;
707 	zs.zfree = z_free;
708 	status = inflateInit(&zs);
709 	if (status != Z_OK) {
710 		status = EIO;
711 		goto abort_with_fw;
712 	}
713 
714 	/* the uncompressed size is stored as the firmware version,
715 	   which would otherwise go unused */
716 	fw_len = (size_t) fw->version;
717 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
718 	if (inflate_buffer == NULL)
719 		{ status = ENOMEM; goto abort_with_zs; }
720 	zs.avail_in = fw->datasize;
721 	zs.next_in = __DECONST(char *, fw->data);
722 	zs.avail_out = fw_len;
723 	zs.next_out = inflate_buffer;
724 	status = inflate(&zs, Z_FINISH);
725 	if (status != Z_STREAM_END) {
726 		device_printf(sc->dev, "zlib %d\n", status);
727 		status = EIO;
728 		goto abort_with_buffer;
729 	}
730 
731 	/* check id */
732 	hdr_offset = htobe32(*(const uint32_t *)
733 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
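	/* the header pointer stored in the image is big-endian;
	   htobe32() and be32toh() are the same byte swap, so this
	   converts it to host order */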
734 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
735 		device_printf(sc->dev, "Bad firmware file\n");
736 		status = EIO;
737 		goto abort_with_buffer;
738 	}
739 	hdr = (const void*)(inflate_buffer + hdr_offset);
740 
741 	status = mxge_validate_firmware(sc, hdr);
742 	if (status != 0)
743 		goto abort_with_buffer;
744 
745 	/* Copy the inflated firmware to NIC SRAM. */
746 	for (i = 0; i < fw_len; i += 256) {
747 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
748 			      inflate_buffer + i,
749 			      min(256U, (unsigned)(fw_len - i)));
750 		wmb();
751 		dummy = *sc->sram;
752 		wmb();
753 	}
754 
755 	*limit = fw_len;
756 	status = 0;
757 abort_with_buffer:
758 	free(inflate_buffer, M_TEMP);
759 abort_with_zs:
760 	inflateEnd(&zs);
761 abort_with_fw:
762 	firmware_put(fw, FIRMWARE_UNLOAD);
763 	return status;
764 }
765 
766 /*
767  * Enable or disable periodic RDMAs from the host to make certain
768  * chipsets resend dropped PCIe messages
769  */
770 
771 static void
772 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
773 {
774 	char buf_bytes[72];
775 	volatile uint32_t *confirm;
776 	volatile char *submit;
777 	uint32_t *buf, dma_low, dma_high;
778 	int i;
779 
780 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
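	/* buf now points at buf_bytes rounded up to an 8-byte
	   boundary, presumably so the 64-byte PIO copy below stays
	   aligned */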
781 
782 	/* clear confirmation addr */
783 	confirm = (volatile uint32_t *)sc->cmd;
784 	*confirm = 0;
785 	wmb();
786 
787 	/* send an rdma command to the PCIe engine, and wait for the
788 	   response in the confirmation address.  The firmware should
789 	   write a -1 there to indicate it is alive and well
790 	*/
791 
792 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
793 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
794 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
795 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
796 	buf[2] = htobe32(0xffffffff);		/* confirm data */
797 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
798 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
799 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
800 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
801 	buf[5] = htobe32(enable);			/* enable? */
802 
803 
804 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
805 
806 	mxge_pio_copy(submit, buf, 64);
807 	wmb();
808 	DELAY(1000);
809 	wmb();
810 	i = 0;
811 	while (*confirm != 0xffffffff && i < 20) {
812 		DELAY(1000);
813 		i++;
814 	}
815 	if (*confirm != 0xffffffff) {
816 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
817 			      (enable ? "enable" : "disable"), confirm,
818 			      *confirm);
819 	}
820 	return;
821 }
822 
823 static int
824 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
825 {
826 	mcp_cmd_t *buf;
827 	char buf_bytes[sizeof(*buf) + 8];
828 	volatile mcp_cmd_response_t *response = sc->cmd;
829 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
830 	uint32_t dma_low, dma_high;
831 	int err, sleep_total = 0;
832 
833 	/* ensure buf is aligned to 8 bytes */
834 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
835 
836 	buf->data0 = htobe32(data->data0);
837 	buf->data1 = htobe32(data->data1);
838 	buf->data2 = htobe32(data->data2);
839 	buf->cmd = htobe32(cmd);
840 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
841 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
842 
843 	buf->response_addr.low = htobe32(dma_low);
844 	buf->response_addr.high = htobe32(dma_high);
845 	mtx_lock(&sc->cmd_mtx);
846 	response->result = 0xffffffff;
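	/* preset the result to the "still pending" sentinel; the
	   firmware overwrites it when the command completes */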
847 	wmb();
848 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
849 
850 	/* wait up to 20ms */
851 	err = EAGAIN;
852 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
853 		bus_dmamap_sync(sc->cmd_dma.dmat,
854 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
855 		wmb();
856 		switch (be32toh(response->result)) {
857 		case 0:
858 			data->data0 = be32toh(response->data);
859 			err = 0;
860 			break;
861 		case 0xffffffff:
862 			DELAY(1000);
863 			break;
864 		case MXGEFW_CMD_UNKNOWN:
865 			err = ENOSYS;
866 			break;
867 		case MXGEFW_CMD_ERROR_UNALIGNED:
868 			err = E2BIG;
869 			break;
870 		case MXGEFW_CMD_ERROR_BUSY:
871 			err = EBUSY;
872 			break;
873 		default:
874 			device_printf(sc->dev,
875 				      "mxge: command %d "
876 				      "failed, result = %d\n",
877 				      cmd, be32toh(response->result));
878 			err = ENXIO;
879 			break;
880 		}
881 		if (err != EAGAIN)
882 			break;
883 	}
884 	if (err == EAGAIN)
885 		device_printf(sc->dev, "mxge: command %d timed out, "
886 			      "result = %d\n",
887 			      cmd, be32toh(response->result));
888 	mtx_unlock(&sc->cmd_mtx);
889 	return err;
890 }
891 
892 static int
893 mxge_adopt_running_firmware(mxge_softc_t *sc)
894 {
895 	struct mcp_gen_header *hdr;
896 	const size_t bytes = sizeof (struct mcp_gen_header);
897 	size_t hdr_offset;
898 	int status;
899 
900 	/* find running firmware header */
901 	hdr_offset = htobe32(*(volatile uint32_t *)
902 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
903 
904 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
905 		device_printf(sc->dev,
906 			      "Running firmware has bad header offset (%d)\n",
907 			      (int)hdr_offset);
908 		return EIO;
909 	}
910 
911 	/* copy header of running firmware from SRAM to host memory to
912 	 * validate firmware */
913 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
914 	if (hdr == NULL) {
915 		device_printf(sc->dev, "could not malloc firmware hdr\n");
916 		return ENOMEM;
917 	}
918 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
919 				rman_get_bushandle(sc->mem_res),
920 				hdr_offset, (char *)hdr, bytes);
921 	status = mxge_validate_firmware(sc, hdr);
922 	free(hdr, M_DEVBUF);
923 
924 	/*
925 	 * check to see if adopted firmware has bug where adopting
926 	 * it will cause broadcasts to be filtered unless the NIC
927 	 * is kept in ALLMULTI mode
928 	 */
929 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
930 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
931 		sc->adopted_rx_filter_bug = 1;
932 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
933 			      "working around rx filter bug\n",
934 			      sc->fw_ver_major, sc->fw_ver_minor,
935 			      sc->fw_ver_tiny);
936 	}
937 
938 	return status;
939 }
940 
941 
942 static int
943 mxge_load_firmware(mxge_softc_t *sc, int adopt)
944 {
945 	volatile uint32_t *confirm;
946 	volatile char *submit;
947 	char buf_bytes[72];
948 	uint32_t *buf, size, dma_low, dma_high;
949 	int status, i;
950 
951 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
952 
953 	size = sc->sram_size;
954 	status = mxge_load_firmware_helper(sc, &size);
955 	if (status) {
956 		if (!adopt)
957 			return status;
958 		/* Try to use the currently running firmware, if
959 		   it is new enough */
960 		status = mxge_adopt_running_firmware(sc);
961 		if (status) {
962 			device_printf(sc->dev,
963 				      "failed to adopt running firmware\n");
964 			return status;
965 		}
966 		device_printf(sc->dev,
967 			      "Successfully adopted running firmware\n");
968 		if (sc->tx_boundary == 4096) {
969 			device_printf(sc->dev,
970 				"Using firmware currently running on NIC"
971 				 ".  For optimal\n");
972 			device_printf(sc->dev,
973 				 "performance consider loading optimized "
974 				 "firmware\n");
975 		}
976 		sc->fw_name = mxge_fw_unaligned;
977 		sc->tx_boundary = 2048;
978 		return 0;
979 	}
980 	/* clear confirmation addr */
981 	confirm = (volatile uint32_t *)sc->cmd;
982 	*confirm = 0;
983 	wmb();
984 	/* send a reload command to the bootstrap MCP, and wait for the
985 	   response in the confirmation address.  The firmware should
986 	   write a -1 there to indicate it is alive and well
987 	*/
988 
989 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
990 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
991 
992 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
993 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
994 	buf[2] = htobe32(0xffffffff);	/* confirm data */
995 
996 	/* FIX: All newest firmware should un-protect the bottom of
997 	   the sram before handoff. However, the very first interfaces
998 	   do not. Therefore the handoff copy must skip the first 8 bytes
999 	*/
1000 					/* where the code starts*/
1001 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1002 	buf[4] = htobe32(size - 8); 	/* length of code */
1003 	buf[5] = htobe32(8);		/* where to copy to */
1004 	buf[6] = htobe32(0);		/* where to jump to */
1005 
1006 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1007 	mxge_pio_copy(submit, buf, 64);
1008 	wmb();
1009 	DELAY(1000);
1010 	wmb();
1011 	i = 0;
1012 	while (*confirm != 0xffffffff && i < 20) {
1013 		DELAY(1000*10);
1014 		i++;
1015 		bus_dmamap_sync(sc->cmd_dma.dmat,
1016 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1017 	}
1018 	if (*confirm != 0xffffffff) {
1019 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1020 			confirm, *confirm);
1021 
1022 		return ENXIO;
1023 	}
1024 	return 0;
1025 }
1026 
1027 static int
1028 mxge_update_mac_address(mxge_softc_t *sc)
1029 {
1030 	mxge_cmd_t cmd;
1031 	uint8_t *addr = sc->mac_addr;
1032 	int status;
1033 
1034 
1035 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1036 		     | (addr[2] << 8) | addr[3]);
1037 
1038 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1039 
1040 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1041 	return status;
1042 }
1043 
1044 static int
1045 mxge_change_pause(mxge_softc_t *sc, int pause)
1046 {
1047 	mxge_cmd_t cmd;
1048 	int status;
1049 
1050 	if (pause)
1051 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1052 				       &cmd);
1053 	else
1054 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1055 				       &cmd);
1056 
1057 	if (status) {
1058 		device_printf(sc->dev, "Failed to set flow control mode\n");
1059 		return ENXIO;
1060 	}
1061 	sc->pause = pause;
1062 	return 0;
1063 }
1064 
1065 static void
1066 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1067 {
1068 	mxge_cmd_t cmd;
1069 	int status;
1070 
1071 	if (mxge_always_promisc)
1072 		promisc = 1;
1073 
1074 	if (promisc)
1075 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1076 				       &cmd);
1077 	else
1078 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1079 				       &cmd);
1080 
1081 	if (status) {
1082 		device_printf(sc->dev, "Failed to set promisc mode\n");
1083 	}
1084 }
1085 
1086 static void
1087 mxge_set_multicast_list(mxge_softc_t *sc)
1088 {
1089 	mxge_cmd_t cmd;
1090 	struct ifmultiaddr *ifma;
1091 	struct ifnet *ifp = sc->ifp;
1092 	int err;
1093 
1094 	/* This firmware is known to not support multicast */
1095 	if (!sc->fw_multicast_support)
1096 		return;
1097 
1098 	/* Disable multicast filtering while we play with the lists*/
1099 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1100 	if (err != 0) {
1101 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1102 		       " error status: %d\n", err);
1103 		return;
1104 	}
1105 
1106 	if (sc->adopted_rx_filter_bug)
1107 		return;
1108 
1109 	if (ifp->if_flags & IFF_ALLMULTI)
1110 		/* request to disable multicast filtering, so quit here */
1111 		return;
1112 
1113 	/* Flush all the filters */
1114 
1115 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1116 	if (err != 0) {
1117 		device_printf(sc->dev,
1118 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1119 			      ", error status: %d\n", err);
1120 		return;
1121 	}
1122 
1123 	/* Walk the multicast list, and add each address */
1124 
1125 	IF_ADDR_LOCK(ifp);
1126 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1127 		if (ifma->ifma_addr->sa_family != AF_LINK)
1128 			continue;
1129 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1130 		      &cmd.data0, 4);
1131 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1132 		      &cmd.data1, 2);
1133 		cmd.data0 = htonl(cmd.data0);
1134 		cmd.data1 = htonl(cmd.data1);
1135 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1136 		if (err != 0) {
1137 			device_printf(sc->dev, "Failed "
1138 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
1139 			       "%d\n", err);
1140 			/* abort, leaving multicast filtering off */
1141 			IF_ADDR_UNLOCK(ifp);
1142 			return;
1143 		}
1144 	}
1145 	IF_ADDR_UNLOCK(ifp);
1146 	/* Enable multicast filtering */
1147 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1148 	if (err != 0) {
1149 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1150 		       ", error status: %d\n", err);
1151 	}
1152 }
1153 
1154 static int
1155 mxge_max_mtu(mxge_softc_t *sc)
1156 {
1157 	mxge_cmd_t cmd;
1158 	int status;
1159 
1160 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1161 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1162 
1163 	/* try to set nbufs to see if we can
1164 	   use virtually contiguous jumbos */
1165 	cmd.data0 = 0;
1166 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1167 			       &cmd);
1168 	if (status == 0)
1169 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1170 
1171 	/* otherwise, we're limited to MJUMPAGESIZE */
1172 	return MJUMPAGESIZE - MXGEFW_PAD;
1173 }
1174 
1175 static int
1176 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1177 {
1178 	struct mxge_slice_state *ss;
1179 	mxge_rx_done_t *rx_done;
1180 	volatile uint32_t *irq_claim;
1181 	mxge_cmd_t cmd;
1182 	int slice, status;
1183 
1184 	/* try to send a reset command to the card to see if it
1185 	   is alive */
1186 	memset(&cmd, 0, sizeof (cmd));
1187 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1188 	if (status != 0) {
1189 		device_printf(sc->dev, "failed reset\n");
1190 		return ENXIO;
1191 	}
1192 
1193 	mxge_dummy_rdma(sc, 1);
1194 
1195 
1196 	/* set the intrq size */
1197 	cmd.data0 = sc->rx_ring_size;
1198 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1199 
1200 	/*
1201 	 * Even though we already know how many slices are supported
1202 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1203 	 * has magic side effects, and must be called after a reset.
1204 	 * It must be called prior to calling any RSS related cmds,
1205 	 * including assigning an interrupt queue for anything but
1206 	 * slice 0.  It must also be called *after*
1207 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1208 	 * the firmware to compute offsets.
1209 	 */
1210 
1211 	if (sc->num_slices > 1) {
1212 		/* ask the maximum number of slices it supports */
1213 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1214 					   &cmd);
1215 		if (status != 0) {
1216 			device_printf(sc->dev,
1217 				      "failed to get number of slices\n");
1218 			return status;
1219 		}
1220 		/*
1221 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1222 		 * to setting up the interrupt queue DMA
1223 		 */
1224 		cmd.data0 = sc->num_slices;
1225 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1226 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1227 					   &cmd);
1228 		if (status != 0) {
1229 			device_printf(sc->dev,
1230 				      "failed to set number of slices\n");
1231 			return status;
1232 		}
1233 	}
1234 
1235 
1236 	if (interrupts_setup) {
1237 		/* Now exchange information about interrupts  */
1238 		for (slice = 0; slice < sc->num_slices; slice++) {
1239 			rx_done = &sc->ss[slice].rx_done;
1240 			memset(rx_done->entry, 0, sc->rx_ring_size);
1241 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1242 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1243 			cmd.data2 = slice;
1244 			status |= mxge_send_cmd(sc,
1245 						MXGEFW_CMD_SET_INTRQ_DMA,
1246 						&cmd);
1247 		}
1248 	}
1249 
1250 	status |= mxge_send_cmd(sc,
1251 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1252 
1253 
1254 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1255 
1256 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1257 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1258 
1259 
1260 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1261 				&cmd);
1262 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1263 	if (status != 0) {
1264 		device_printf(sc->dev, "failed set interrupt parameters\n");
1265 		return status;
1266 	}
1267 
1268 
1269 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1270 
1271 
1272 	/* run a DMA benchmark */
1273 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1274 
1275 	for (slice = 0; slice < sc->num_slices; slice++) {
1276 		ss = &sc->ss[slice];
1277 
1278 		ss->irq_claim = irq_claim + (2 * slice);
1279 		/* reset mcp/driver shared state back to 0 */
1280 		ss->rx_done.idx = 0;
1281 		ss->rx_done.cnt = 0;
1282 		ss->tx.req = 0;
1283 		ss->tx.done = 0;
1284 		ss->tx.pkt_done = 0;
1285 		ss->tx.wake = 0;
1286 		ss->tx.defrag = 0;
1287 		ss->tx.stall = 0;
1288 		ss->rx_big.cnt = 0;
1289 		ss->rx_small.cnt = 0;
1290 		ss->lro_bad_csum = 0;
1291 		ss->lro_queued = 0;
1292 		ss->lro_flushed = 0;
1293 		if (ss->fw_stats != NULL) {
1294 			ss->fw_stats->valid = 0;
1295 			ss->fw_stats->send_done_count = 0;
1296 		}
1297 	}
1298 	sc->rdma_tags_available = 15;
1299 	status = mxge_update_mac_address(sc);
1300 	mxge_change_promisc(sc, 0);
1301 	mxge_change_pause(sc, sc->pause);
1302 	mxge_set_multicast_list(sc);
1303 	return status;
1304 }
1305 
1306 static int
1307 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1308 {
1309         mxge_softc_t *sc;
1310         unsigned int intr_coal_delay;
1311         int err;
1312 
1313         sc = arg1;
1314         intr_coal_delay = sc->intr_coal_delay;
1315         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1316         if (err != 0) {
1317                 return err;
1318         }
1319         if (intr_coal_delay == sc->intr_coal_delay)
1320                 return 0;
1321 
1322         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1323                 return EINVAL;
1324 
1325 	mtx_lock(&sc->driver_mtx);
1326 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1327 	sc->intr_coal_delay = intr_coal_delay;
1328 
1329 	mtx_unlock(&sc->driver_mtx);
1330         return err;
1331 }
1332 
1333 static int
1334 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1335 {
1336         mxge_softc_t *sc;
1337         unsigned int enabled;
1338         int err;
1339 
1340         sc = arg1;
1341         enabled = sc->pause;
1342         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1343         if (err != 0) {
1344                 return err;
1345         }
1346         if (enabled == sc->pause)
1347                 return 0;
1348 
1349 	mtx_lock(&sc->driver_mtx);
1350 	err = mxge_change_pause(sc, enabled);
1351 	mtx_unlock(&sc->driver_mtx);
1352         return err;
1353 }
1354 
1355 static int
1356 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1357 {
1358 	struct ifnet *ifp;
1359 	int err = 0;
1360 
1361 	ifp = sc->ifp;
1362 	if (lro_cnt == 0)
1363 		ifp->if_capenable &= ~IFCAP_LRO;
1364 	else
1365 		ifp->if_capenable |= IFCAP_LRO;
1366 	sc->lro_cnt = lro_cnt;
1367 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1368 		mxge_close(sc);
1369 		err = mxge_open(sc);
1370 	}
1371 	return err;
1372 }
1373 
1374 static int
1375 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1376 {
1377 	mxge_softc_t *sc;
1378 	unsigned int lro_cnt;
1379 	int err;
1380 
1381 	sc = arg1;
1382 	lro_cnt = sc->lro_cnt;
1383 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1384 	if (err != 0)
1385 		return err;
1386 
1387 	if (lro_cnt == sc->lro_cnt)
1388 		return 0;
1389 
1390 	if (lro_cnt > 128)
1391 		return EINVAL;
1392 
1393 	mtx_lock(&sc->driver_mtx);
1394 	err = mxge_change_lro_locked(sc, lro_cnt);
1395 	mtx_unlock(&sc->driver_mtx);
1396 	return err;
1397 }
1398 
1399 static int
1400 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1401 {
1402         int err;
1403 
1404         if (arg1 == NULL)
1405                 return EFAULT;
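        /* swap the big-endian counter once and pass it to
           sysctl_handle_int() by value (arg2) rather than by
           pointer, which also makes the sysctl read-only */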
1406         arg2 = be32toh(*(int *)arg1);
1407         arg1 = NULL;
1408         err = sysctl_handle_int(oidp, arg1, arg2, req);
1409 
1410         return err;
1411 }
1412 
1413 static void
1414 mxge_rem_sysctls(mxge_softc_t *sc)
1415 {
1416 	struct mxge_slice_state *ss;
1417 	int slice;
1418 
1419 	if (sc->slice_sysctl_tree == NULL)
1420 		return;
1421 
1422 	for (slice = 0; slice < sc->num_slices; slice++) {
1423 		ss = &sc->ss[slice];
1424 		if (ss == NULL || ss->sysctl_tree == NULL)
1425 			continue;
1426 		sysctl_ctx_free(&ss->sysctl_ctx);
1427 		ss->sysctl_tree = NULL;
1428 	}
1429 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1430 	sc->slice_sysctl_tree = NULL;
1431 }
1432 
1433 static void
1434 mxge_add_sysctls(mxge_softc_t *sc)
1435 {
1436 	struct sysctl_ctx_list *ctx;
1437 	struct sysctl_oid_list *children;
1438 	mcp_irq_data_t *fw;
1439 	struct mxge_slice_state *ss;
1440 	int slice;
1441 	char slice_num[8];
1442 
1443 	ctx = device_get_sysctl_ctx(sc->dev);
1444 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1445 	fw = sc->ss[0].fw_stats;
1446 
1447 	/* random information */
1448 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1449 		       "firmware_version",
1450 		       CTLFLAG_RD, &sc->fw_version,
1451 		       0, "firmware version");
1452 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1453 		       "serial_number",
1454 		       CTLFLAG_RD, &sc->serial_number_string,
1455 		       0, "serial number");
1456 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1457 		       "product_code",
1458 		       CTLFLAG_RD, &sc->product_code_string,
1459 		       0, "product_code");
1460 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1461 		       "pcie_link_width",
1462 		       CTLFLAG_RD, &sc->link_width,
1463 		       0, "PCIe link width");
1464 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1465 		       "tx_boundary",
1466 		       CTLFLAG_RD, &sc->tx_boundary,
1467 		       0, "tx_boundary");
1468 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1469 		       "write_combine",
1470 		       CTLFLAG_RD, &sc->wc,
1471 		       0, "write combining PIO?");
1472 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1473 		       "read_dma_MBs",
1474 		       CTLFLAG_RD, &sc->read_dma,
1475 		       0, "DMA Read speed in MB/s");
1476 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1477 		       "write_dma_MBs",
1478 		       CTLFLAG_RD, &sc->write_dma,
1479 		       0, "DMA Write speed in MB/s");
1480 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 		       "read_write_dma_MBs",
1482 		       CTLFLAG_RD, &sc->read_write_dma,
1483 		       0, "DMA concurrent Read/Write speed in MB/s");
1484 
1485 
1486 	/* performance related tunables */
1487 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1488 			"intr_coal_delay",
1489 			CTLTYPE_INT|CTLFLAG_RW, sc,
1490 			0, mxge_change_intr_coal,
1491 			"I", "interrupt coalescing delay in usecs");
1492 
1493 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1494 			"flow_control_enabled",
1495 			CTLTYPE_INT|CTLFLAG_RW, sc,
1496 			0, mxge_change_flow_control,
1497 			"I", "interrupt coalescing delay in usecs");
1498 
1499 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500 		       "deassert_wait",
1501 		       CTLFLAG_RW, &mxge_deassert_wait,
1502 		       0, "Wait for IRQ line to go low in ihandler");
1503 
1504 	/* stats block from firmware is in network byte order.
1505 	   Need to swap it */
1506 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1507 			"link_up",
1508 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1509 			0, mxge_handle_be32,
1510 			"I", "link up");
1511 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512 			"rdma_tags_available",
1513 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1514 			0, mxge_handle_be32,
1515 			"I", "rdma_tags_available");
1516 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1517 			"dropped_bad_crc32",
1518 			CTLTYPE_INT|CTLFLAG_RD,
1519 			&fw->dropped_bad_crc32,
1520 			0, mxge_handle_be32,
1521 			"I", "dropped_bad_crc32");
1522 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1523 			"dropped_bad_phy",
1524 			CTLTYPE_INT|CTLFLAG_RD,
1525 			&fw->dropped_bad_phy,
1526 			0, mxge_handle_be32,
1527 			"I", "dropped_bad_phy");
1528 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 			"dropped_link_error_or_filtered",
1530 			CTLTYPE_INT|CTLFLAG_RD,
1531 			&fw->dropped_link_error_or_filtered,
1532 			0, mxge_handle_be32,
1533 			"I", "dropped_link_error_or_filtered");
1534 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1535 			"dropped_link_overflow",
1536 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1537 			0, mxge_handle_be32,
1538 			"I", "dropped_link_overflow");
1539 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 			"dropped_multicast_filtered",
1541 			CTLTYPE_INT|CTLFLAG_RD,
1542 			&fw->dropped_multicast_filtered,
1543 			0, mxge_handle_be32,
1544 			"I", "dropped_multicast_filtered");
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 			"dropped_no_big_buffer",
1547 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1548 			0, mxge_handle_be32,
1549 			"I", "dropped_no_big_buffer");
1550 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 			"dropped_no_small_buffer",
1552 			CTLTYPE_INT|CTLFLAG_RD,
1553 			&fw->dropped_no_small_buffer,
1554 			0, mxge_handle_be32,
1555 			"I", "dropped_no_small_buffer");
1556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 			"dropped_overrun",
1558 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1559 			0, mxge_handle_be32,
1560 			"I", "dropped_overrun");
1561 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1562 			"dropped_pause",
1563 			CTLTYPE_INT|CTLFLAG_RD,
1564 			&fw->dropped_pause,
1565 			0, mxge_handle_be32,
1566 			"I", "dropped_pause");
1567 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 			"dropped_runt",
1569 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1570 			0, mxge_handle_be32,
1571 			"I", "dropped_runt");
1572 
1573 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1574 			"dropped_unicast_filtered",
1575 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1576 			0, mxge_handle_be32,
1577 			"I", "dropped_unicast_filtered");
1578 
1579 	/* verbose printing? */
1580 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1581 		       "verbose",
1582 		       CTLFLAG_RW, &mxge_verbose,
1583 		       0, "verbose printing");
1584 
1585 	/* lro */
1586 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 			"lro_cnt",
1588 			CTLTYPE_INT|CTLFLAG_RW, sc,
1589 			0, mxge_change_lro,
1590 			"I", "number of lro merge queues");
1591 
1592 
1593 	/* add counters exported for debugging from all slices */
1594 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1595 	sc->slice_sysctl_tree =
1596 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1597 				"slice", CTLFLAG_RD, 0, "");
1598 
1599 	for (slice = 0; slice < sc->num_slices; slice++) {
1600 		ss = &sc->ss[slice];
1601 		sysctl_ctx_init(&ss->sysctl_ctx);
1602 		ctx = &ss->sysctl_ctx;
1603 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1604 		sprintf(slice_num, "%d", slice);
1605 		ss->sysctl_tree =
1606 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1607 					CTLFLAG_RD, 0, "");
1608 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1609 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1610 			       "rx_small_cnt",
1611 			       CTLFLAG_RD, &ss->rx_small.cnt,
1612 			       0, "rx_small_cnt");
1613 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1614 			       "rx_big_cnt",
1615 			       CTLFLAG_RD, &ss->rx_big.cnt,
1616 			       0, "rx_big_cnt");
1617 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1618 			       "tx_req",
1619 			       CTLFLAG_RD, &ss->tx.req,
1620 			       0, "tx_req");
1621 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1622 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1623 			       0, "number of lro merge queues flushed");
1624 
1625 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1626 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1627 			       0, "number of frames appended to lro merge "
1628 			       "queues");
1629 
1630 		/* only transmit from slice 0 for now */
1631 		if (slice > 0)
1632 			continue;
1633 
1634 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 			       "tx_done",
1636 			       CTLFLAG_RD, &ss->tx.done,
1637 			       0, "tx_done");
1638 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 			       "tx_pkt_done",
1640 			       CTLFLAG_RD, &ss->tx.pkt_done,
1641 			       0, "tx_pkt_done");
1642 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 			       "tx_stall",
1644 			       CTLFLAG_RD, &ss->tx.stall,
1645 			       0, "tx_stall");
1646 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 			       "tx_wake",
1648 			       CTLFLAG_RD, &ss->tx.wake,
1649 			       0, "tx_wake");
1650 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1651 			       "tx_defrag",
1652 			       CTLFLAG_RD, &ss->tx.defrag,
1653 			       0, "tx_defrag");
1654 	}
1655 }
1656 
1657 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1658    backwards one at a time and handle ring wraps */
1659 
1660 static inline void
1661 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1662 			    mcp_kreq_ether_send_t *src, int cnt)
1663 {
1664         int idx, starting_slot;
1665         starting_slot = tx->req;
1666         while (cnt > 1) {
1667                 cnt--;
1668                 idx = (starting_slot + cnt) & tx->mask;
1669                 mxge_pio_copy(&tx->lanai[idx],
1670 			      &src[cnt], sizeof(*src));
1671                 wmb();
1672         }
1673 }
1674 
1675 /*
1676  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1677  * at most 32 bytes at a time, so as to avoid involving the software
1678  * pio handler in the nic.   We re-write the first segment's flags
1679  * to mark them valid only after writing the entire chain
1680  */
1681 
1682 static inline void
1683 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1684                   int cnt)
1685 {
1686         int idx, i;
1687         uint32_t *src_ints;
1688 	volatile uint32_t *dst_ints;
1689         mcp_kreq_ether_send_t *srcp;
1690 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1691 	uint8_t last_flags;
1692 
1693         idx = tx->req & tx->mask;
1694 
1695 	last_flags = src->flags;
1696 	src->flags = 0;
1697         wmb();
1698         dst = dstp = &tx->lanai[idx];
1699         srcp = src;
1700 
1701         if ((idx + cnt) < tx->mask) {
1702                 for (i = 0; i < (cnt - 1); i += 2) {
1703                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1704                         wmb(); /* force write every 32 bytes */
1705                         srcp += 2;
1706                         dstp += 2;
1707                 }
1708         } else {
1709                 /* submit all but the first request, and ensure
1710                    that it is submitted below */
1711                 mxge_submit_req_backwards(tx, src, cnt);
1712                 i = 0;
1713         }
1714         if (i < cnt) {
1715                 /* submit the first request */
1716                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1717                 wmb(); /* barrier before setting valid flag */
1718         }
1719 
1720         /* re-write the last 32-bits with the valid flags */
1721         src->flags = last_flags;
1722         src_ints = (uint32_t *)src;
1723         src_ints+=3;
1724         dst_ints = (volatile uint32_t *)dst;
1725         dst_ints+=3;
1726         *dst_ints =  *src_ints;
1727         tx->req += cnt;
1728         wmb();
1729 }
1730 
1731 #if IFCAP_TSO4
1732 
1733 static void
1734 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1735 	       int busdma_seg_cnt, int ip_off)
1736 {
1737 	mxge_tx_ring_t *tx;
1738 	mcp_kreq_ether_send_t *req;
1739 	bus_dma_segment_t *seg;
1740 	struct ip *ip;
1741 	struct tcphdr *tcp;
1742 	uint32_t low, high_swapped;
1743 	int len, seglen, cum_len, cum_len_next;
1744 	int next_is_first, chop, cnt, rdma_count, small;
1745 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1746 	uint8_t flags, flags_next;
1747 	static int once;
1748 
1749 	mss = m->m_pkthdr.tso_segsz;
1750 
1751 	/* negative cum_len signifies to the
1752 	 * send loop that we are still in the
1753 	 * header portion of the TSO packet.
1754 	 */
1755 
1756 	/* ensure we have the ethernet, IP and TCP
1757 	   header together in the first mbuf, copy
1758 	   it to a scratch buffer if not */
1759 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1760 		m_copydata(m, 0, ip_off + sizeof (*ip),
1761 			   ss->scratch);
1762 		ip = (struct ip *)(ss->scratch + ip_off);
1763 	} else {
1764 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1765 	}
1766 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1767 			    + sizeof (*tcp))) {
1768 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1769 			   + sizeof (*tcp),  ss->scratch);
1770 		ip = (struct ip *)(ss->scratch + ip_off);
1771 	}
1772 
1773 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1774 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
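	/* cum_len now starts at minus the total header length
	   (ether + IP + TCP) and goes non-negative at the payload */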
1775 
1776 	/* TSO implies checksum offload on this hardware */
1777 	cksum_offset = ip_off + (ip->ip_hl << 2);
1778 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1779 
1780 
1781 	/* for TSO, pseudo_hdr_offset holds mss.
1782 	 * The firmware figures out where to put
1783 	 * the checksum by parsing the header. */
1784 	pseudo_hdr_offset = htobe16(mss);
1785 
1786 	tx = &ss->tx;
1787 	req = tx->req_list;
1788 	seg = tx->seg_list;
1789 	cnt = 0;
1790 	rdma_count = 0;
1791 	/* "rdma_count" is the number of RDMAs belonging to the
1792 	 * current packet BEFORE the current send request. For
1793 	 * non-TSO packets, this is equal to "count".
1794 	 * For TSO packets, rdma_count needs to be reset
1795 	 * to 0 after a segment cut.
1796 	 *
1797 	 * The rdma_count field of the send request is
1798 	 * the number of RDMAs of the packet starting at
1799 	 * that request. For TSO send requests with one or more cuts
1800 	 * in the middle, this is the number of RDMAs starting
1801 	 * after the last cut in the request. All previous
1802 	 * segments before the last cut implicitly have 1 RDMA.
1803 	 *
1804 	 * Since the number of RDMAs is not known beforehand,
1805 	 * it must be filled-in retroactively - after each
1806 	 * segmentation cut or at the end of the entire packet.
1807 	 */
1808 
1809 	while (busdma_seg_cnt) {
1810 		/* Break the busdma segment up into pieces*/
1811 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1812 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1813 		len = seg->ds_len;
1814 
1815 		while (len) {
1816 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1817 			seglen = len;
1818 			cum_len_next = cum_len + seglen;
1819 			(req-rdma_count)->rdma_count = rdma_count + 1;
1820 			if (__predict_true(cum_len >= 0)) {
1821 				/* payload */
1822 				chop = (cum_len_next > mss);
1823 				cum_len_next = cum_len_next % mss;
1824 				next_is_first = (cum_len_next == 0);
1825 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1826 				flags_next |= next_is_first *
1827 					MXGEFW_FLAGS_FIRST;
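				/* branchless reset of the RDMA count
				   at a cut: -(flag) is 0 or ~0, so a
				   chop or frame start forces
				   rdma_count to -1, and a chop that
				   is not also a frame start adds one
				   for the straddling request; the
				   rdma_count++ below then resumes
				   counting */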
1828 				rdma_count |= -(chop | next_is_first);
1829 				rdma_count += chop & !next_is_first;
1830 			} else if (cum_len_next >= 0) {
1831 				/* header ends */
1832 				rdma_count = -1;
1833 				cum_len_next = 0;
1834 				seglen = -cum_len;
1835 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1836 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1837 					MXGEFW_FLAGS_FIRST |
1838 					(small * MXGEFW_FLAGS_SMALL);
1839 			}
1840 
1841 			req->addr_high = high_swapped;
1842 			req->addr_low = htobe32(low);
1843 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1844 			req->pad = 0;
1845 			req->rdma_count = 1;
1846 			req->length = htobe16(seglen);
1847 			req->cksum_offset = cksum_offset;
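			/* (cum_len & 1) is 0 or 1, so ALIGN_ODD is set only
			 * when this piece of the packet starts on an odd
			 * byte boundary, presumably letting the firmware's
			 * checksum engine compensate */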
1848 			req->flags = flags | ((cum_len & 1) *
1849 					      MXGEFW_FLAGS_ALIGN_ODD);
1850 			low += seglen;
1851 			len -= seglen;
1852 			cum_len = cum_len_next;
1853 			flags = flags_next;
1854 			req++;
1855 			cnt++;
1856 			rdma_count++;
1857 			if (__predict_false(cksum_offset > seglen))
1858 				cksum_offset -= seglen;
1859 			else
1860 				cksum_offset = 0;
1861 			if (__predict_false(cnt > tx->max_desc))
1862 				goto drop;
1863 		}
1864 		busdma_seg_cnt--;
1865 		seg++;
1866 	}
1867 	(req-rdma_count)->rdma_count = rdma_count;
1868 
1869 	do {
1870 		req--;
1871 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1872 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1873 
1874 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1875 	mxge_submit_req(tx, tx->req_list, cnt);
1876 	return;
1877 
1878 drop:
1879 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1880 	m_freem(m);
1881 	ss->sc->ifp->if_oerrors++;
1882 	if (!once) {
1883 		printf("tx->max_desc exceeded via TSO!\n");
1884 		printf("mss = %d, seg offset = %ld bytes, max_desc = %d\n",
1885 		       mss, (long)seg - (long)tx->seg_list, tx->max_desc);
1886 		once = 1;
1887 	}
1888 	return;
1889 
1890 }
1891 
1892 #endif /* IFCAP_TSO4 */
1893 
1894 #ifdef MXGE_NEW_VLAN_API
1895 /*
1896  * We reproduce the software vlan tag insertion from
1897  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1898  * vlan tag insertion. We need to advertise this in order to have the
1899  * vlan interface respect our csum offload flags.
1900  */
1901 static struct mbuf *
1902 mxge_vlan_tag_insert(struct mbuf *m)
1903 {
1904 	struct ether_vlan_header *evl;
1905 
1906 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1907 	if (__predict_false(m == NULL))
1908 		return NULL;
1909 	if (m->m_len < sizeof(*evl)) {
1910 		m = m_pullup(m, sizeof(*evl));
1911 		if (__predict_false(m == NULL))
1912 			return NULL;
1913 	}
1914 	/*
1915 	 * Transform the Ethernet header into an Ethernet header
1916 	 * with 802.1Q encapsulation.
1917 	 */
1918 	evl = mtod(m, struct ether_vlan_header *);
1919 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1920 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1921 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1922 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1923 	m->m_flags &= ~M_VLANTAG;
1924 	return m;
1925 }
1926 #endif /* MXGE_NEW_VLAN_API */
1927 
1928 static void
1929 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1930 {
1931 	mxge_softc_t *sc;
1932 	mcp_kreq_ether_send_t *req;
1933 	bus_dma_segment_t *seg;
1934 	struct mbuf *m_tmp;
1935 	struct ifnet *ifp;
1936 	mxge_tx_ring_t *tx;
1937 	struct ip *ip;
1938 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1939 	uint16_t pseudo_hdr_offset;
1940 	uint8_t flags, cksum_offset;
1941 
1942 
1943 	sc = ss->sc;
1944 	ifp = sc->ifp;
1945 	tx = &ss->tx;
1946 
1947 	ip_off = sizeof (struct ether_header);
1948 #ifdef MXGE_NEW_VLAN_API
1949 	if (m->m_flags & M_VLANTAG) {
1950 		m = mxge_vlan_tag_insert(m);
1951 		if (__predict_false(m == NULL))
1952 			goto drop;
1953 		ip_off += ETHER_VLAN_ENCAP_LEN;
1954 	}
1955 #endif
1956 	/* (try to) map the frame for DMA */
1957 	idx = tx->req & tx->mask;
1958 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1959 				      m, tx->seg_list, &cnt,
1960 				      BUS_DMA_NOWAIT);
1961 	if (__predict_false(err == EFBIG)) {
1962 		/* Too many segments in the chain.  Try
1963 		   to defrag */
1964 		m_tmp = m_defrag(m, M_NOWAIT);
1965 		if (m_tmp == NULL) {
1966 			goto drop;
1967 		}
1968 		ss->tx.defrag++;
1969 		m = m_tmp;
1970 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1971 					      tx->info[idx].map,
1972 					      m, tx->seg_list, &cnt,
1973 					      BUS_DMA_NOWAIT);
1974 	}
1975 	if (__predict_false(err != 0)) {
1976 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1977 			      " packet len = %d\n", err, m->m_pkthdr.len);
1978 		goto drop;
1979 	}
1980 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1981 			BUS_DMASYNC_PREWRITE);
1982 	tx->info[idx].m = m;
1983 
1984 #if IFCAP_TSO4
1985 	/* TSO is different enough, we handle it in another routine */
1986 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1987 		mxge_encap_tso(ss, m, cnt, ip_off);
1988 		return;
1989 	}
1990 #endif
1991 
1992 	req = tx->req_list;
1993 	cksum_offset = 0;
1994 	pseudo_hdr_offset = 0;
1995 	flags = MXGEFW_FLAGS_NO_TSO;
1996 
1997 	/* checksum offloading? */
1998 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1999 		/* ensure ip header is in first mbuf, copy
2000 		   it to a scratch buffer if not */
2001 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2002 			m_copydata(m, 0, ip_off + sizeof (*ip),
2003 				   ss->scratch);
2004 			ip = (struct ip *)(ss->scratch + ip_off);
2005 		} else {
2006 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2007 		}
2008 		cksum_offset = ip_off + (ip->ip_hl << 2);
2009 		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2010 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2011 		req->cksum_offset = cksum_offset;
2012 		flags |= MXGEFW_FLAGS_CKSUM;
2013 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2014 	} else {
2015 		odd_flag = 0;
2016 	}
2017 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2018 		flags |= MXGEFW_FLAGS_SMALL;
2019 
2020 	/* convert segments into a request list */
2021 	cum_len = 0;
2022 	seg = tx->seg_list;
2023 	req->flags = MXGEFW_FLAGS_FIRST;
2024 	for (i = 0; i < cnt; i++) {
2025 		req->addr_low =
2026 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2027 		req->addr_high =
2028 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2029 		req->length = htobe16(seg->ds_len);
2030 		req->cksum_offset = cksum_offset;
2031 		if (cksum_offset > seg->ds_len)
2032 			cksum_offset -= seg->ds_len;
2033 		else
2034 			cksum_offset = 0;
2035 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2036 		req->pad = 0; /* complete solid 16-byte block */
2037 		req->rdma_count = 1;
2038 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2039 		cum_len += seg->ds_len;
2040 		seg++;
2041 		req++;
2042 		req->flags = 0;
2043 	}
2044 	req--;
2045 	/* pad runts to 60 bytes */
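	/* 60 is ETHER_MIN_LEN less the 4-byte FCS; the pad is DMAed
	 * from a block of zeros, so it cannot perturb the hardware
	 * checksum */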
2046 	if (cum_len < 60) {
2047 		req++;
2048 		req->addr_low =
2049 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2050 		req->addr_high =
2051 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2052 		req->length = htobe16(60 - cum_len);
2053 		req->cksum_offset = 0;
2054 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2055 		req->pad = 0; /* complete solid 16-byte block */
2056 		req->rdma_count = 1;
2057 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2058 		cnt++;
2059 	}
2060 
2061 	tx->req_list[0].rdma_count = cnt;
2062 #if 0
2063 	/* print what the firmware will see */
2064 	for (i = 0; i < cnt; i++) {
2065 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2066 		    "cso:%d, flags:0x%x, rdma:%d\n",
2067 		    i, (int)ntohl(tx->req_list[i].addr_high),
2068 		    (int)ntohl(tx->req_list[i].addr_low),
2069 		    (int)ntohs(tx->req_list[i].length),
2070 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2071 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2072 		    tx->req_list[i].rdma_count);
2073 	}
2074 	printf("--------------\n");
2075 #endif
2076 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2077 	mxge_submit_req(tx, tx->req_list, cnt);
2078 	return;
2079 
2080 drop:
2081 	m_freem(m);
2082 	ifp->if_oerrors++;
2083 	return;
2084 }
2085 
2086 
2087 
2088 
2089 static inline void
2090 mxge_start_locked(struct mxge_slice_state *ss)
2091 {
2092 	mxge_softc_t *sc;
2093 	struct mbuf *m;
2094 	struct ifnet *ifp;
2095 	mxge_tx_ring_t *tx;
2096 
2097 	sc = ss->sc;
2098 	ifp = sc->ifp;
2099 	tx = &ss->tx;
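	/* tx->req - tx->done is the count of descriptors in flight;
	 * keep dequeuing only while a worst-case packet of
	 * tx->max_desc descriptors still fits in the ring */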
2100 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2101 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2102 		if (m == NULL) {
2103 			return;
2104 		}
2105 		/* let BPF see it */
2106 		BPF_MTAP(ifp, m);
2107 
2108 		/* give it to the nic */
2109 		mxge_encap(ss, m);
2110 	}
2111 	/* ran out of transmit slots */
2112 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2113 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2114 		tx->stall++;
2115 	}
2116 }
2117 
2118 static void
2119 mxge_start(struct ifnet *ifp)
2120 {
2121 	mxge_softc_t *sc = ifp->if_softc;
2122 	struct mxge_slice_state *ss;
2123 
2124 	/* only use the first slice for now */
2125 	ss = &sc->ss[0];
2126 	mtx_lock(&ss->tx.mtx);
2127 	mxge_start_locked(ss);
2128 	mtx_unlock(&ss->tx.mtx);
2129 }
2130 
2131 /*
2132  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2133  * at most 32 bytes at a time, so as to avoid involving the software
2134  * pio handler in the nic.   We re-write the first segment's low
2135  * DMA address to mark it valid only after we write the entire chunk
2136  * in a burst
2137  */
2138 static inline void
2139 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2140 		mcp_kreq_ether_recv_t *src)
2141 {
2142 	uint32_t low;
2143 
2144 	low = src->addr_low;
2145 	src->addr_low = 0xffffffff;
2146 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2147 	wmb();
2148 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2149 	wmb();
2150 	src->addr_low = low;
2151 	dst->addr_low = low;
2152 	wmb();
2153 }
2154 
2155 static int
2156 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2157 {
2158 	bus_dma_segment_t seg;
2159 	struct mbuf *m;
2160 	mxge_rx_ring_t *rx = &ss->rx_small;
2161 	int cnt, err;
2162 
2163 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2164 	if (m == NULL) {
2165 		rx->alloc_fail++;
2166 		err = ENOBUFS;
2167 		goto done;
2168 	}
2169 	m->m_len = MHLEN;
2170 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2171 				      &seg, &cnt, BUS_DMA_NOWAIT);
2172 	if (err != 0) {
2173 		m_free(m);
2174 		goto done;
2175 	}
2176 	rx->info[idx].m = m;
2177 	rx->shadow[idx].addr_low =
2178 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2179 	rx->shadow[idx].addr_high =
2180 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2181 
2182 done:
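	/* receive buffers are handed to the NIC in bursts of 8
	 * (see mxge_submit_8rx), so only every 8th index submits */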
2183 	if ((idx & 7) == 7)
2184 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2185 	return err;
2186 }
2187 
2188 static int
2189 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2190 {
2191 	bus_dma_segment_t seg[3];
2192 	struct mbuf *m;
2193 	mxge_rx_ring_t *rx = &ss->rx_big;
2194 	int cnt, err, i;
2195 
2196 	if (rx->cl_size == MCLBYTES)
2197 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2198 	else
2199 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2200 	if (m == NULL) {
2201 		rx->alloc_fail++;
2202 		err = ENOBUFS;
2203 		goto done;
2204 	}
2205 	m->m_len = rx->cl_size;
2206 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2207 				      seg, &cnt, BUS_DMA_NOWAIT);
2208 	if (err != 0) {
2209 		m_free(m);
2210 		goto done;
2211 	}
2212 	rx->info[idx].m = m;
2213 	rx->shadow[idx].addr_low =
2214 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2215 	rx->shadow[idx].addr_high =
2216 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2217 
2218 #if MXGE_VIRT_JUMBOS
2219 	for (i = 1; i < cnt; i++) {
2220 		rx->shadow[idx + i].addr_low =
2221 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2222 		rx->shadow[idx + i].addr_high =
2223 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2224 	}
2225 #endif
2226 
2227 done:
2228 	for (i = 0; i < rx->nbufs; i++) {
2229 		if ((idx & 7) == 7) {
2230 			mxge_submit_8rx(&rx->lanai[idx - 7],
2231 					&rx->shadow[idx - 7]);
2232 		}
2233 		idx++;
2234 	}
2235 	return err;
2236 }
2237 
2238 /*
2239  *  Myri10GE hardware checksums are not valid if the sender
2240  *  padded the frame with non-zero padding.  This is because
2241  *  the firmware just does a simple 16-bit 1s complement
2242  *  checksum across the entire frame, excluding the first 14
2243 	 *  bytes.  It is best to simply check the checksum and
2244  *  tell the stack about it only if the checksum is good
2245  */
2246 
2247 static inline uint16_t
2248 mxge_rx_csum(struct mbuf *m, int csum)
2249 {
2250 	struct ether_header *eh;
2251 	struct ip *ip;
2252 	uint16_t c;
2253 
2254 	eh = mtod(m, struct ether_header *);
2255 
2256 	/* only deal with IPv4 TCP & UDP for now */
2257 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2258 		return 1;
2259 	ip = (struct ip *)(eh + 1);
2260 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2261 			    ip->ip_p != IPPROTO_UDP))
2262 		return 1;
2263 
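	/* The firmware checksums everything past the Ethernet header,
	 * including the IP header; but a valid IP header sums to
	 * 0xffff, the ones-complement additive identity, so its
	 * contribution folds away.  Adding the pseudo-header and
	 * complementing thus yields 0 exactly when the TCP/UDP
	 * checksum is good. */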
2264 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2265 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2266 			    (ip->ip_hl << 2) + ip->ip_p));
2267 	c ^= 0xffff;
2268 	return (c);
2269 }
2270 
2271 static void
2272 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2273 {
2274 	struct ether_vlan_header *evl;
2275 	struct ether_header *eh;
2276 	uint32_t partial;
2277 
2278 	evl = mtod(m, struct ether_vlan_header *);
2279 	eh = mtod(m, struct ether_header *);
2280 
2281 	/*
2282 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2283 	 * after what the firmware thought was the end of the ethernet
2284 	 * header.
2285 	 */
2286 
2287 	/* put checksum into host byte order */
2288 	*csum = ntohs(*csum);
2289 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
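	/* ones-complement subtraction: add the complement of the four
	 * encapsulation bytes plus the end-around carry, then fold the
	 * 32-bit sum back down to 16 bits */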
2290 	(*csum) += ~partial;
2291 	(*csum) += ((*csum) < ~partial);
2292 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2293 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2294 
2295 	/* restore checksum to network byte order;
2296 	   later consumers expect this */
2297 	*csum = htons(*csum);
2298 
2299 	/* save the tag */
2300 #ifdef MXGE_NEW_VLAN_API
2301 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2302 #else
2303 	{
2304 		struct m_tag *mtag;
2305 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2306 				   M_NOWAIT);
2307 		if (mtag == NULL)
2308 			return;
2309 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2310 		m_tag_prepend(m, mtag);
2311 	}
2312 
2313 #endif
2314 	m->m_flags |= M_VLANTAG;
2315 
2316 	/*
2317 	 * Remove the 802.1q header by copying the Ethernet
2318 	 * addresses over it and adjusting the beginning of
2319 	 * the data in the mbuf.  The encapsulated Ethernet
2320 	 * type field is already in place.
2321 	 */
2322 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2323 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2324 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2325 }
2326 
2327 
2328 static inline void
2329 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2330 {
2331 	mxge_softc_t *sc;
2332 	struct ifnet *ifp;
2333 	struct mbuf *m;
2334 	struct ether_header *eh;
2335 	mxge_rx_ring_t *rx;
2336 	bus_dmamap_t old_map;
2337 	int idx;
2338 	uint16_t tcpudp_csum;
2339 
2340 	sc = ss->sc;
2341 	ifp = sc->ifp;
2342 	rx = &ss->rx_big;
2343 	idx = rx->cnt & rx->mask;
2344 	rx->cnt += rx->nbufs;
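	/* a "big" receive may span rx->nbufs ring slots when jumbo
	 * frames are assembled from multiple smaller buffers */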
2345 	/* save a pointer to the received mbuf */
2346 	m = rx->info[idx].m;
2347 	/* try to replace the received mbuf */
2348 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2349 		/* drop the frame -- the old mbuf is re-cycled */
2350 		ifp->if_ierrors++;
2351 		return;
2352 	}
2353 
2354 	/* unmap the received buffer */
2355 	old_map = rx->info[idx].map;
2356 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2357 	bus_dmamap_unload(rx->dmat, old_map);
2358 
2359 	/* swap the bus_dmamap_t's */
2360 	rx->info[idx].map = rx->extra_map;
2361 	rx->extra_map = old_map;
2362 
2363 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2364 	 * aligned */
2365 	m->m_data += MXGEFW_PAD;
2366 
2367 	m->m_pkthdr.rcvif = ifp;
2368 	m->m_len = m->m_pkthdr.len = len;
2369 	ss->ipackets++;
2370 	eh = mtod(m, struct ether_header *);
2371 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2372 		mxge_vlan_tag_remove(m, &csum);
2373 	}
2374 	/* if the checksum is valid, mark it in the mbuf header */
2375 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2376 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2377 			return;
2378 		/* otherwise, it was a UDP frame, or a TCP frame which
2379 		   we could not do LRO on.  Tell the stack that the
2380 		   checksum is good */
2381 		m->m_pkthdr.csum_data = 0xffff;
2382 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2383 	}
2384 	/* pass the frame up the stack */
2385 	(*ifp->if_input)(ifp, m);
2386 }
2387 
2388 static inline void
2389 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2390 {
2391 	mxge_softc_t *sc;
2392 	struct ifnet *ifp;
2393 	struct ether_header *eh;
2394 	struct mbuf *m;
2395 	mxge_rx_ring_t *rx;
2396 	bus_dmamap_t old_map;
2397 	int idx;
2398 	uint16_t tcpudp_csum;
2399 
2400 	sc = ss->sc;
2401 	ifp = sc->ifp;
2402 	rx = &ss->rx_small;
2403 	idx = rx->cnt & rx->mask;
2404 	rx->cnt++;
2405 	/* save a pointer to the received mbuf */
2406 	m = rx->info[idx].m;
2407 	/* try to replace the received mbuf */
2408 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2409 		/* drop the frame -- the old mbuf is re-cycled */
2410 		ifp->if_ierrors++;
2411 		return;
2412 	}
2413 
2414 	/* unmap the received buffer */
2415 	old_map = rx->info[idx].map;
2416 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2417 	bus_dmamap_unload(rx->dmat, old_map);
2418 
2419 	/* swap the bus_dmamap_t's */
2420 	rx->info[idx].map = rx->extra_map;
2421 	rx->extra_map = old_map;
2422 
2423 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2424 	 * aligned */
2425 	m->m_data += MXGEFW_PAD;
2426 
2427 	m->m_pkthdr.rcvif = ifp;
2428 	m->m_len = m->m_pkthdr.len = len;
2429 	ss->ipackets++;
2430 	eh = mtod(m, struct ether_header *);
2431 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2432 		mxge_vlan_tag_remove(m, &csum);
2433 	}
2434 	/* if the checksum is valid, mark it in the mbuf header */
2435 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2436 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2437 			return;
2438 		/* otherwise, it was a UDP frame, or a TCP frame which
2439 		   we could not do LRO on.  Tell the stack that the
2440 		   checksum is good */
2441 		m->m_pkthdr.csum_data = 0xffff;
2442 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2443 	}
2444 	/* pass the frame up the stack */
2445 	(*ifp->if_input)(ifp, m);
2446 }
2447 
2448 static inline void
2449 mxge_clean_rx_done(struct mxge_slice_state *ss)
2450 {
2451 	mxge_rx_done_t *rx_done = &ss->rx_done;
2452 	struct lro_entry *lro;
2453 	int limit = 0;
2454 	uint16_t length;
2455 	uint16_t checksum;
2456 
2457 
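	/* a non-zero length marks a completed receive in the ring; it
	 * is cleared below so the slot reads as empty on the next pass
	 * around the ring */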
2458 	while (rx_done->entry[rx_done->idx].length != 0) {
2459 		length = ntohs(rx_done->entry[rx_done->idx].length);
2460 		rx_done->entry[rx_done->idx].length = 0;
2461 		checksum = rx_done->entry[rx_done->idx].checksum;
2462 		if (length <= (MHLEN - MXGEFW_PAD))
2463 			mxge_rx_done_small(ss, length, checksum);
2464 		else
2465 			mxge_rx_done_big(ss, length, checksum);
2466 		rx_done->cnt++;
2467 		rx_done->idx = rx_done->cnt & rx_done->mask;
2468 
2469 		/* limit potential for livelock */
2470 		if (__predict_false(++limit > rx_done->mask / 2))
2471 			break;
2472 	}
2473 	while (!SLIST_EMPTY(&ss->lro_active)) {
2474 		lro = SLIST_FIRST(&ss->lro_active);
2475 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2476 		mxge_lro_flush(ss, lro);
2477 	}
2478 }
2479 
2480 
2481 static inline void
2482 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2483 {
2484 	struct ifnet *ifp;
2485 	mxge_tx_ring_t *tx;
2486 	struct mbuf *m;
2487 	bus_dmamap_t map;
2488 	int idx;
2489 
2490 	tx = &ss->tx;
2491 	ifp = ss->sc->ifp;
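	/* tx->info[].flag marks the last descriptor of each packet
	 * (set at submit time), so pkt_done advances once per packet
	 * and can be compared with the firmware's send_done_count */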
2492 	while (tx->pkt_done != mcp_idx) {
2493 		idx = tx->done & tx->mask;
2494 		tx->done++;
2495 		m = tx->info[idx].m;
2496 		/* mbuf and DMA map only attached to the first
2497 		   segment per-mbuf */
2498 		if (m != NULL) {
2499 			ifp->if_opackets++;
2500 			tx->info[idx].m = NULL;
2501 			map = tx->info[idx].map;
2502 			bus_dmamap_unload(tx->dmat, map);
2503 			m_freem(m);
2504 		}
2505 		if (tx->info[idx].flag) {
2506 			tx->info[idx].flag = 0;
2507 			tx->pkt_done++;
2508 		}
2509 	}
2510 
2511 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2512 	   it's OK to send packets */
2513 
2514 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2515 	    tx->req - tx->done < (tx->mask + 1)/4) {
2516 		mtx_lock(&ss->tx.mtx);
2517 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2518 		ss->tx.wake++;
2519 		mxge_start_locked(ss);
2520 		mtx_unlock(&ss->tx.mtx);
2521 	}
2522 }
2523 
2524 static struct mxge_media_type mxge_xfp_media_types[] =
2525 {
2526 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2527 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2528 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2529 	{0,		(1 << 5),	"10GBASE-ER"},
2530 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2531 	{0,		(1 << 3),	"10GBASE-SW"},
2532 	{0,		(1 << 2),	"10GBASE-LW"},
2533 	{0,		(1 << 1),	"10GBASE-EW"},
2534 	{0,		(1 << 0),	"Reserved"}
2535 };
2536 static struct mxge_media_type mxge_sfp_media_types[] =
2537 {
2538 	{0,		(1 << 7),	"Reserved"},
2539 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2540 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2541 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2542 };
2543 
2544 static void
2545 mxge_set_media(mxge_softc_t *sc, int type)
2546 {
2547 	sc->media_flags |= type;
2548 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2549 	ifmedia_set(&sc->media, sc->media_flags);
2550 }
2551 
2552 
2553 /*
2554  * Determine the media type for a NIC.  Some XFPs will identify
2555  * themselves only when their link is up, so this is initiated via a
2556  * link up interrupt.  However, this can potentially take up to
2557  * several milliseconds, so it is run via the watchdog routine, rather
2558  * than in the interrupt handler itself.   This need only be done
2559  * once, not each time the link is up.
2560  */
2561 static void
2562 mxge_media_probe(mxge_softc_t *sc)
2563 {
2564 	mxge_cmd_t cmd;
2565 	char *cage_type;
2566 	char *ptr;
2567 	struct mxge_media_type *mxge_media_types = NULL;
2568 	int i, err, ms, mxge_media_type_entries;
2569 	uint32_t byte;
2570 
2571 	sc->need_media_probe = 0;
2572 
2573 	/* if we've already set a media type, we're done */
2574 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2575 		return;
2576 
2577 	/*
2578 	 * parse the product code to determine the interface type
2579 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2580 	 * after the 3rd dash in the driver's cached copy of the
2581 	 * EEPROM's product code string.
2582 	 */
2583 	ptr = sc->product_code_string;
2584 	if (ptr == NULL) {
2585 		device_printf(sc->dev, "Missing product code\n");
		return;
2586 	}
2587 
2588 	for (i = 0; i < 3; i++, ptr++) {
2589 		ptr = index(ptr, '-');
2590 		if (ptr == NULL) {
2591 			device_printf(sc->dev,
2592 				      "only %d dashes in PC?!?\n", i);
2593 			return;
2594 		}
2595 	}
2596 	if (*ptr == 'C') {
2597 		/* -C is CX4 */
2598 		mxge_set_media(sc, IFM_10G_CX4);
2599 		return;
2600 	}
2601 	else if (*ptr == 'Q') {
2602 		/* -Q is Quad Ribbon Fiber */
2603 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2604 		/* FreeBSD has no media type for Quad ribbon fiber */
2605 		return;
2606 	}
2607 
2608 	if (*ptr == 'R') {
2609 		/* -R is XFP */
2610 		mxge_media_types = mxge_xfp_media_types;
2611 		mxge_media_type_entries =
2612 			sizeof (mxge_xfp_media_types) /
2613 			sizeof (mxge_xfp_media_types[0]);
2614 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2615 		cage_type = "XFP";
2616 	}
2617 
2618 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2619 		/* -S or -2S is SFP+ */
2620 		mxge_media_types = mxge_sfp_media_types;
2621 		mxge_media_type_entries =
2622 			sizeof (mxge_sfp_media_types) /
2623 			sizeof (mxge_sfp_media_types[0]);
2624 		cage_type = "SFP+";
2625 		byte = 3;
2626 	}
2627 
2628 	if (mxge_media_types == NULL) {
2629 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2630 		return;
2631 	}
2632 
2633 	/*
2634 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2635 	 * now we try to determine what is in the cage by using the
2636 	 * firmware's I2C commands to read the module's 10GbE
2637 	 * compliance register.  We read just one byte, which may
2638 	 * take over a millisecond.
2639 	 */
2640 
2641 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2642 	cmd.data1 = byte;
2643 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2644 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2645 		device_printf(sc->dev, "failed to read XFP\n");
2646 	}
2647 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2648 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2649 	}
2650 	if (err != MXGEFW_CMD_OK) {
2651 		return;
2652 	}
2653 
2654 	/* now we wait for the data to be cached */
2655 	cmd.data0 = byte;
2656 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2657 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2658 		DELAY(1000);
2659 		cmd.data0 = byte;
2660 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2661 	}
2662 	if (err != MXGEFW_CMD_OK) {
2663 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2664 			      cage_type, err, ms);
2665 		return;
2666 	}
2667 
2668 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2669 		if (mxge_verbose)
2670 			device_printf(sc->dev, "%s:%s\n", cage_type,
2671 				      mxge_media_types[0].name);
2672 		mxge_set_media(sc, IFM_10G_CX4);
2673 		return;
2674 	}
2675 	for (i = 1; i < mxge_media_type_entries; i++) {
2676 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2677 			if (mxge_verbose)
2678 				device_printf(sc->dev, "%s:%s\n",
2679 					      cage_type,
2680 					      mxge_media_types[i].name);
2681 
2682 			mxge_set_media(sc, mxge_media_types[i].flag);
2683 			return;
2684 		}
2685 	}
2686 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2687 		      cmd.data0);
2688 
2689 	return;
2690 }
2691 
2692 static void
2693 mxge_intr(void *arg)
2694 {
2695 	struct mxge_slice_state *ss = arg;
2696 	mxge_softc_t *sc = ss->sc;
2697 	mcp_irq_data_t *stats = ss->fw_stats;
2698 	mxge_tx_ring_t *tx = &ss->tx;
2699 	mxge_rx_done_t *rx_done = &ss->rx_done;
2700 	uint32_t send_done_count;
2701 	uint8_t valid;
2702 
2703 
2704 	/* an interrupt on a non-zero slice is implicitly valid
2705 	   since MSI-X irqs are not shared */
2706 	if (ss != sc->ss) {
2707 		mxge_clean_rx_done(ss);
2708 		*ss->irq_claim = be32toh(3);
2709 		return;
2710 	}
2711 
2712 	/* make sure the DMA has finished */
2713 	if (!stats->valid) {
2714 		return;
2715 	}
2716 	valid = stats->valid;
2717 
2718 	if (sc->legacy_irq) {
2719 		/* lower legacy IRQ  */
2720 		*sc->irq_deassert = 0;
2721 		if (!mxge_deassert_wait)
2722 			/* don't wait for confirmation that the irq is low */
2723 			stats->valid = 0;
2724 	} else {
2725 		stats->valid = 0;
2726 	}
2727 
2728 	/* loop while waiting for legacy irq deassertion */
2729 	do {
2730 		/* check for transmit completes and receives */
2731 		send_done_count = be32toh(stats->send_done_count);
2732 		while ((send_done_count != tx->pkt_done) ||
2733 		       (rx_done->entry[rx_done->idx].length != 0)) {
2734 			mxge_tx_done(ss, (int)send_done_count);
2735 			mxge_clean_rx_done(ss);
2736 			send_done_count = be32toh(stats->send_done_count);
2737 		}
2738 		if (sc->legacy_irq && mxge_deassert_wait)
2739 			wmb();
2740 	} while (*((volatile uint8_t *) &stats->valid));
2741 
2742 	if (__predict_false(stats->stats_updated)) {
2743 		if (sc->link_state != stats->link_up) {
2744 			sc->link_state = stats->link_up;
2745 			if (sc->link_state) {
2746 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2747 				if (mxge_verbose)
2748 					device_printf(sc->dev, "link up\n");
2749 			} else {
2750 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2751 				if (mxge_verbose)
2752 					device_printf(sc->dev, "link down\n");
2753 			}
2754 			sc->need_media_probe = 1;
2755 		}
2756 		if (sc->rdma_tags_available !=
2757 		    be32toh(stats->rdma_tags_available)) {
2758 			sc->rdma_tags_available =
2759 				be32toh(stats->rdma_tags_available);
2760 			device_printf(sc->dev, "RDMA timed out! %d tags "
2761 				      "left\n", sc->rdma_tags_available);
2762 		}
2763 
2764 		if (stats->link_down) {
2765 			sc->down_cnt += stats->link_down;
2766 			sc->link_state = 0;
2767 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2768 		}
2769 	}
2770 
2771 	/* check to see if we have an rx token to pass back */
2772 	if (valid & 0x1)
2773 		*ss->irq_claim = be32toh(3);
2774 	*(ss->irq_claim + 1) = be32toh(3);
2775 }
2776 
2777 static void
2778 mxge_init(void *arg)
2779 {
2780 }
2781 
2782 
2783 
2784 static void
2785 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2786 {
2787 	struct lro_entry *lro_entry;
2788 	int i;
2789 
2790 	while (!SLIST_EMPTY(&ss->lro_free)) {
2791 		lro_entry = SLIST_FIRST(&ss->lro_free);
2792 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2793 		free(lro_entry, M_DEVBUF);
2794 	}
2795 
2796 	for (i = 0; i <= ss->rx_big.mask; i++) {
2797 		if (ss->rx_big.info[i].m == NULL)
2798 			continue;
2799 		bus_dmamap_unload(ss->rx_big.dmat,
2800 				  ss->rx_big.info[i].map);
2801 		m_freem(ss->rx_big.info[i].m);
2802 		ss->rx_big.info[i].m = NULL;
2803 	}
2804 
2805 	for (i = 0; i <= ss->rx_small.mask; i++) {
2806 		if (ss->rx_small.info[i].m == NULL)
2807 			continue;
2808 		bus_dmamap_unload(ss->rx_small.dmat,
2809 				  ss->rx_small.info[i].map);
2810 		m_freem(ss->rx_small.info[i].m);
2811 		ss->rx_small.info[i].m = NULL;
2812 	}
2813 
2814 	/* transmit ring used only on the first slice */
2815 	if (ss->tx.info == NULL)
2816 		return;
2817 
2818 	for (i = 0; i <= ss->tx.mask; i++) {
2819 		ss->tx.info[i].flag = 0;
2820 		if (ss->tx.info[i].m == NULL)
2821 			continue;
2822 		bus_dmamap_unload(ss->tx.dmat,
2823 				  ss->tx.info[i].map);
2824 		m_freem(ss->tx.info[i].m);
2825 		ss->tx.info[i].m = NULL;
2826 	}
2827 }
2828 
2829 static void
2830 mxge_free_mbufs(mxge_softc_t *sc)
2831 {
2832 	int slice;
2833 
2834 	for (slice = 0; slice < sc->num_slices; slice++)
2835 		mxge_free_slice_mbufs(&sc->ss[slice]);
2836 }
2837 
2838 static void
2839 mxge_free_slice_rings(struct mxge_slice_state *ss)
2840 {
2841 	int i;
2842 
2843 
2844 	if (ss->rx_done.entry != NULL)
2845 		mxge_dma_free(&ss->rx_done.dma);
2846 	ss->rx_done.entry = NULL;
2847 
2848 	if (ss->tx.req_bytes != NULL)
2849 		free(ss->tx.req_bytes, M_DEVBUF);
2850 	ss->tx.req_bytes = NULL;
2851 
2852 	if (ss->tx.seg_list != NULL)
2853 		free(ss->tx.seg_list, M_DEVBUF);
2854 	ss->tx.seg_list = NULL;
2855 
2856 	if (ss->rx_small.shadow != NULL)
2857 		free(ss->rx_small.shadow, M_DEVBUF);
2858 	ss->rx_small.shadow = NULL;
2859 
2860 	if (ss->rx_big.shadow != NULL)
2861 		free(ss->rx_big.shadow, M_DEVBUF);
2862 	ss->rx_big.shadow = NULL;
2863 
2864 	if (ss->tx.info != NULL) {
2865 		if (ss->tx.dmat != NULL) {
2866 			for (i = 0; i <= ss->tx.mask; i++) {
2867 				bus_dmamap_destroy(ss->tx.dmat,
2868 						   ss->tx.info[i].map);
2869 			}
2870 			bus_dma_tag_destroy(ss->tx.dmat);
2871 		}
2872 		free(ss->tx.info, M_DEVBUF);
2873 	}
2874 	ss->tx.info = NULL;
2875 
2876 	if (ss->rx_small.info != NULL) {
2877 		if (ss->rx_small.dmat != NULL) {
2878 			for (i = 0; i <= ss->rx_small.mask; i++) {
2879 				bus_dmamap_destroy(ss->rx_small.dmat,
2880 						   ss->rx_small.info[i].map);
2881 			}
2882 			bus_dmamap_destroy(ss->rx_small.dmat,
2883 					   ss->rx_small.extra_map);
2884 			bus_dma_tag_destroy(ss->rx_small.dmat);
2885 		}
2886 		free(ss->rx_small.info, M_DEVBUF);
2887 	}
2888 	ss->rx_small.info = NULL;
2889 
2890 	if (ss->rx_big.info != NULL) {
2891 		if (ss->rx_big.dmat != NULL) {
2892 			for (i = 0; i <= ss->rx_big.mask; i++) {
2893 				bus_dmamap_destroy(ss->rx_big.dmat,
2894 						   ss->rx_big.info[i].map);
2895 			}
2896 			bus_dmamap_destroy(ss->rx_big.dmat,
2897 					   ss->rx_big.extra_map);
2898 			bus_dma_tag_destroy(ss->rx_big.dmat);
2899 		}
2900 		free(ss->rx_big.info, M_DEVBUF);
2901 	}
2902 	ss->rx_big.info = NULL;
2903 }
2904 
2905 static void
2906 mxge_free_rings(mxge_softc_t *sc)
2907 {
2908 	int slice;
2909 
2910 	for (slice = 0; slice < sc->num_slices; slice++)
2911 		mxge_free_slice_rings(&sc->ss[slice]);
2912 }
2913 
2914 static int
2915 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2916 		       int tx_ring_entries)
2917 {
2918 	mxge_softc_t *sc = ss->sc;
2919 	size_t bytes;
2920 	int err, i;
2921 
2922 	err = ENOMEM;
2923 
2924 	/* allocate per-slice receive resources */
2925 
2926 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2927 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2928 
2929 	/* allocate the rx shadow rings */
2930 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2931 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2932 	if (ss->rx_small.shadow == NULL)
2933 		return err;
2934 
2935 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2936 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2937 	if (ss->rx_big.shadow == NULL)
2938 		return err;
2939 
2940 	/* allocate the rx host info rings */
2941 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2942 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2943 	if (ss->rx_small.info == NULL)
2944 		return err;
2945 
2946 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2947 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2948 	if (ss->rx_big.info == NULL)
2949 		return err;
2950 
2951 	/* allocate the rx busdma resources */
2952 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2953 				 1,			/* alignment */
2954 				 4096,			/* boundary */
2955 				 BUS_SPACE_MAXADDR,	/* low */
2956 				 BUS_SPACE_MAXADDR,	/* high */
2957 				 NULL, NULL,		/* filter */
2958 				 MHLEN,			/* maxsize */
2959 				 1,			/* num segs */
2960 				 MHLEN,			/* maxsegsize */
2961 				 BUS_DMA_ALLOCNOW,	/* flags */
2962 				 NULL, NULL,		/* lock */
2963 				 &ss->rx_small.dmat);	/* tag */
2964 	if (err != 0) {
2965 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2966 			      err);
2967 		return err;
2968 	}
2969 
2970 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2971 				 1,			/* alignment */
2972 #if MXGE_VIRT_JUMBOS
2973 				 4096,			/* boundary */
2974 #else
2975 				 0,			/* boundary */
2976 #endif
2977 				 BUS_SPACE_MAXADDR,	/* low */
2978 				 BUS_SPACE_MAXADDR,	/* high */
2979 				 NULL, NULL,		/* filter */
2980 				 3*4096,		/* maxsize */
2981 #if MXGE_VIRT_JUMBOS
2982 				 3,			/* num segs */
2983 				 4096,			/* maxsegsize*/
2984 #else
2985 				 1,			/* num segs */
2986 				 MJUM9BYTES,		/* maxsegsize*/
2987 #endif
2988 				 BUS_DMA_ALLOCNOW,	/* flags */
2989 				 NULL, NULL,		/* lock */
2990 				 &ss->rx_big.dmat);	/* tag */
2991 	if (err != 0) {
2992 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2993 			      err);
2994 		return err;
2995 	}
2996 	for (i = 0; i <= ss->rx_small.mask; i++) {
2997 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
2998 					&ss->rx_small.info[i].map);
2999 		if (err != 0) {
3000 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3001 				      err);
3002 			return err;
3003 		}
3004 	}
3005 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3006 				&ss->rx_small.extra_map);
3007 	if (err != 0) {
3008 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3009 			      err);
3010 		return err;
3011 	}
3012 
3013 	for (i = 0; i <= ss->rx_big.mask; i++) {
3014 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3015 					&ss->rx_big.info[i].map);
3016 		if (err != 0) {
3017 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3018 				      err);
3019 			return err;
3020 		}
3021 	}
3022 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3023 				&ss->rx_big.extra_map);
3024 	if (err != 0) {
3025 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3026 			      err);
3027 		return err;
3028 	}
3029 
3030 	/* now allocate TX resources */
3031 
3032 	/* only use a single TX ring for now */
3033 	if (ss != ss->sc->ss)
3034 		return 0;
3035 
3036 	ss->tx.mask = tx_ring_entries - 1;
3037 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3038 
3039 
3040 	/* allocate the tx request copy block */
3041 	bytes = 8 +
3042 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3043 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3044 	if (ss->tx.req_bytes == NULL)
3045 		return err;
3046 	/* ensure req_list entries are aligned to 8 bytes */
3047 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3048 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3049 
3050 	/* allocate the tx busdma segment list */
3051 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3052 	ss->tx.seg_list = (bus_dma_segment_t *)
3053 		malloc(bytes, M_DEVBUF, M_WAITOK);
3054 	if (ss->tx.seg_list == NULL)
3055 		return err;
3056 
3057 	/* allocate the tx host info ring */
3058 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3059 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3060 	if (ss->tx.info == NULL)
3061 		return err;
3062 
3063 	/* allocate the tx busdma resources */
3064 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3065 				 1,			/* alignment */
3066 				 sc->tx_boundary,	/* boundary */
3067 				 BUS_SPACE_MAXADDR,	/* low */
3068 				 BUS_SPACE_MAXADDR,	/* high */
3069 				 NULL, NULL,		/* filter */
3070 				 65536 + 256,		/* maxsize */
3071 				 ss->tx.max_desc - 2,	/* num segs */
3072 				 sc->tx_boundary,	/* maxsegsz */
3073 				 BUS_DMA_ALLOCNOW,	/* flags */
3074 				 NULL, NULL,		/* lock */
3075 				 &ss->tx.dmat);		/* tag */
3076 
3077 	if (err != 0) {
3078 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3079 			      err);
3080 		return err;
3081 	}
3082 
3083 	/* now use these tags to setup dmamaps for each slot
3084 	   in the ring */
3085 	for (i = 0; i <= ss->tx.mask; i++) {
3086 		err = bus_dmamap_create(ss->tx.dmat, 0,
3087 					&ss->tx.info[i].map);
3088 		if (err != 0) {
3089 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3090 				      err);
3091 			return err;
3092 		}
3093 	}
3094 	return 0;
3095 
3096 }
3097 
3098 static int
3099 mxge_alloc_rings(mxge_softc_t *sc)
3100 {
3101 	mxge_cmd_t cmd;
3102 	int tx_ring_size;
3103 	int tx_ring_entries, rx_ring_entries;
3104 	int err, slice;
3105 
3106 	/* get ring sizes */
3107 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3108 	tx_ring_size = cmd.data0;
3109 	if (err != 0) {
3110 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3111 		goto abort;
3112 	}
3113 
3114 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3115 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3116 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3117 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3118 	IFQ_SET_READY(&sc->ifp->if_snd);
3119 
3120 	for (slice = 0; slice < sc->num_slices; slice++) {
3121 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3122 					     rx_ring_entries,
3123 					     tx_ring_entries);
3124 		if (err != 0)
3125 			goto abort;
3126 	}
3127 	return 0;
3128 
3129 abort:
3130 	mxge_free_rings(sc);
3131 	return err;
3132 
3133 }
3134 
3135 
3136 static void
3137 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3138 {
3139 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3140 
3141 	if (bufsize < MCLBYTES) {
3142 		/* easy, everything fits in a single buffer */
3143 		*big_buf_size = MCLBYTES;
3144 		*cl_size = MCLBYTES;
3145 		*nbufs = 1;
3146 		return;
3147 	}
3148 
3149 	if (bufsize < MJUMPAGESIZE) {
3150 		/* still easy, everything still fits in a single buffer */
3151 		*big_buf_size = MJUMPAGESIZE;
3152 		*cl_size = MJUMPAGESIZE;
3153 		*nbufs = 1;
3154 		return;
3155 	}
3156 #if MXGE_VIRT_JUMBOS
3157 	/* now we need to use virtually contiguous buffers */
3158 	*cl_size = MJUM9BYTES;
3159 	*big_buf_size = 4096;
3160 	*nbufs = mtu / 4096 + 1;
3161 	/* needs to be a power of two, so round up */
3162 	if (*nbufs == 3)
3163 		*nbufs = 4;
3164 #else
3165 	*cl_size = MJUM9BYTES;
3166 	*big_buf_size = MJUM9BYTES;
3167 	*nbufs = 1;
3168 #endif
3169 }
3170 
3171 static int
3172 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3173 {
3174 	mxge_softc_t *sc;
3175 	mxge_cmd_t cmd;
3176 	bus_dmamap_t map;
3177 	struct lro_entry *lro_entry;
3178 	int err, i, slice;
3179 
3180 
3181 	sc = ss->sc;
3182 	slice = ss - sc->ss;
3183 
3184 	SLIST_INIT(&ss->lro_free);
3185 	SLIST_INIT(&ss->lro_active);
3186 
3187 	for (i = 0; i < sc->lro_cnt; i++) {
3188 		lro_entry = (struct lro_entry *)
3189 			malloc(sizeof (*lro_entry), M_DEVBUF,
3190 			       M_NOWAIT | M_ZERO);
3191 		if (lro_entry == NULL) {
3192 			sc->lro_cnt = i;
3193 			break;
3194 		}
3195 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3196 	}
3197 	/* get the lanai pointers to the send and receive rings */
3198 
3199 	err = 0;
3200 	/* We currently only send from the first slice */
3201 	if (slice == 0) {
3202 		cmd.data0 = slice;
3203 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3204 		ss->tx.lanai =
3205 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3206 	}
3207 	cmd.data0 = slice;
3208 	err |= mxge_send_cmd(sc,
3209 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3210 	ss->rx_small.lanai =
3211 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3212 	cmd.data0 = slice;
3213 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3214 	ss->rx_big.lanai =
3215 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3216 
3217 	if (err != 0) {
3218 		device_printf(sc->dev,
3219 			      "failed to get ring sizes or locations\n");
3220 		return EIO;
3221 	}
3222 
3223 	/* stock receive rings */
3224 	for (i = 0; i <= ss->rx_small.mask; i++) {
3225 		map = ss->rx_small.info[i].map;
3226 		err = mxge_get_buf_small(ss, map, i);
3227 		if (err) {
3228 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3229 				      i, ss->rx_small.mask + 1);
3230 			return ENOMEM;
3231 		}
3232 	}
3233 	for (i = 0; i <= ss->rx_big.mask; i++) {
3234 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3235 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3236 	}
3237 	ss->rx_big.nbufs = nbufs;
3238 	ss->rx_big.cl_size = cl_size;
3239 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3240 		map = ss->rx_big.info[i].map;
3241 		err = mxge_get_buf_big(ss, map, i);
3242 		if (err) {
3243 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3244 				      i, ss->rx_big.mask + 1);
3245 			return ENOMEM;
3246 		}
3247 	}
3248 	return 0;
3249 }
3250 
3251 static int
3252 mxge_open(mxge_softc_t *sc)
3253 {
3254 	mxge_cmd_t cmd;
3255 	int err, big_bytes, nbufs, slice, cl_size, i;
3256 	bus_addr_t bus;
3257 	volatile uint8_t *itable;
3258 
3259 	/* Copy the MAC address in case it was overridden */
3260 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3261 
3262 	err = mxge_reset(sc, 1);
3263 	if (err != 0) {
3264 		device_printf(sc->dev, "failed to reset\n");
3265 		return EIO;
3266 	}
3267 
3268 	if (sc->num_slices > 1) {
3269 		/* setup the indirection table */
3270 		cmd.data0 = sc->num_slices;
3271 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3272 				    &cmd);
3273 
3274 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3275 				     &cmd);
3276 		if (err != 0) {
3277 			device_printf(sc->dev,
3278 				      "failed to setup rss tables\n");
3279 			return err;
3280 		}
3281 
3282 		/* just enable an identity mapping */
3283 		itable = sc->sram + cmd.data0;
3284 		for (i = 0; i < sc->num_slices; i++)
3285 			itable[i] = (uint8_t)i;
3286 
3287 		cmd.data0 = 1;
3288 		cmd.data1 = mxge_rss_hash_type;
3289 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3290 		if (err != 0) {
3291 			device_printf(sc->dev, "failed to enable slices\n");
3292 			return err;
3293 		}
3294 	}
3295 
3296 
3297 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3298 
3299 	cmd.data0 = nbufs;
3300 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3301 			    &cmd);
3302 	/* error is only meaningful if we're trying to set
3303 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3304 	if (err && nbufs > 1) {
3305 		device_printf(sc->dev,
3306 			      "Failed to set always-use-n to %d\n",
3307 			      nbufs);
3308 		return EIO;
3309 	}
3310 	/* Give the firmware the mtu and the big and small buffer
3311 	   sizes.  The firmware wants the big buf size to be a power
3312 	   of two. Luckily, FreeBSD's clusters are powers of two */
3313 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3314 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3315 	cmd.data0 = MHLEN - MXGEFW_PAD;
3316 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3317 			     &cmd);
3318 	cmd.data0 = big_bytes;
3319 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3320 
3321 	if (err != 0) {
3322 		device_printf(sc->dev, "failed to setup params\n");
3323 		goto abort;
3324 	}
3325 
3326 	/* Now give the firmware the pointer to the stats block */
3327 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3328 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3329 	cmd.data2 = sizeof(struct mcp_irq_data);
3330 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3331 
3332 	if (err != 0) {
3333 		bus = sc->ss->fw_stats_dma.bus_addr;
3334 		bus += offsetof(struct mcp_irq_data, send_done_count);
3335 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3336 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3337 		err = mxge_send_cmd(sc,
3338 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3339 				    &cmd);
3340 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3341 		sc->fw_multicast_support = 0;
3342 	} else {
3343 		sc->fw_multicast_support = 1;
3344 	}
3345 
3346 	if (err != 0) {
3347 		device_printf(sc->dev, "failed to setup params\n");
3348 		goto abort;
3349 	}
3350 
3351 	for (slice = 0; slice < sc->num_slices; slice++) {
3352 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3353 		if (err != 0) {
3354 			device_printf(sc->dev, "couldn't open slice %d\n",
3355 				      slice);
3356 			goto abort;
3357 		}
3358 	}
3359 
3360 	/* Finally, start the firmware running */
3361 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3362 	if (err) {
3363 		device_printf(sc->dev, "Couldn't bring up link\n");
3364 		goto abort;
3365 	}
3366 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3367 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3368 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3369 
3370 	return 0;
3371 
3372 
3373 abort:
3374 	mxge_free_mbufs(sc);
3375 
3376 	return err;
3377 }
3378 
3379 static int
3380 mxge_close(mxge_softc_t *sc)
3381 {
3382 	mxge_cmd_t cmd;
3383 	int err, old_down_cnt;
3384 
3385 	callout_stop(&sc->co_hdl);
3386 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3387 	old_down_cnt = sc->down_cnt;
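	/* the firmware acknowledges ETHERNET_DOWN by raising a
	 * link-down interrupt, which increments down_cnt; the waits
	 * below poll for that acknowledgment */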
3388 	wmb();
3389 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3390 	if (err) {
3391 		device_printf(sc->dev, "Couldn't bring down link\n");
3392 	}
3393 	if (old_down_cnt == sc->down_cnt) {
3394 		/* wait for down irq */
3395 		DELAY(10 * sc->intr_coal_delay);
3396 	}
3397 	wmb();
3398 	if (old_down_cnt == sc->down_cnt) {
3399 		device_printf(sc->dev, "never got down irq\n");
3400 	}
3401 
3402 	mxge_free_mbufs(sc);
3403 
3404 	return 0;
3405 }
3406 
3407 static void
3408 mxge_setup_cfg_space(mxge_softc_t *sc)
3409 {
3410 	device_t dev = sc->dev;
3411 	int reg;
3412 	uint16_t cmd, lnk, pectl;
3413 
3414 	/* find the PCIe link width and set max read request to 4KB */
3415 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3416 		lnk = pci_read_config(dev, reg + 0x12, 2);
3417 		sc->link_width = (lnk >> 4) & 0x3f;
3418 
3419 		pectl = pci_read_config(dev, reg + 0x8, 2);
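		/* the max read request size field is bits 14:12 of the
		 * PCIe device control register; encoding 5 selects
		 * 4096 bytes */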
3420 		pectl = (pectl & ~0x7000) | (5 << 12);
3421 		pci_write_config(dev, reg + 0x8, pectl, 2);
3422 	}
3423 
3424 	/* Enable DMA and Memory space access */
3425 	pci_enable_busmaster(dev);
3426 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3427 	cmd |= PCIM_CMD_MEMEN;
3428 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3429 }
3430 
3431 static uint32_t
3432 mxge_read_reboot(mxge_softc_t *sc)
3433 {
3434 	device_t dev = sc->dev;
3435 	uint32_t vs;
3436 
3437 	/* find the vendor specific offset */
3438 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3439 		device_printf(sc->dev,
3440 			      "could not find vendor specific offset\n");
3441 		return (uint32_t)-1;
3442 	}
3443 	/* enable read32 mode */
3444 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3445 	/* tell NIC which register to read */
3446 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3447 	return (pci_read_config(dev, vs + 0x14, 4));
3448 }
3449 
3450 static int
3451 mxge_watchdog_reset(mxge_softc_t *sc)
3452 {
3453 	struct pci_devinfo *dinfo;
3454 	int err;
3455 	uint32_t reboot;
3456 	uint16_t cmd;
3457 
3458 	err = ENXIO;
3459 
3460 	device_printf(sc->dev, "Watchdog reset!\n");
3461 
3462 	/*
3463 	 * check to see if the NIC rebooted.  If it did, then all of
3464 	 * PCI config space has been reset, and things like the
3465 	 * busmaster bit will be zero.  If this is the case, then we
3466 	 * must restore PCI config space before the NIC can be used
3467 	 * again
3468 	 */
3469 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3470 	if (cmd == 0xffff) {
3471 		/*
3472 		 * maybe the watchdog caught the NIC rebooting; wait
3473 		 * up to 100ms for it to finish.  If it does not come
3474 		 * back, then give up
3475 		 */
3476 		DELAY(1000*100);
3477 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3478 		if (cmd == 0xffff) {
3479 			device_printf(sc->dev, "NIC disappeared!\n");
3480 			return (err);
3481 		}
3482 	}
3483 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3484 		/* print the reboot status */
3485 		reboot = mxge_read_reboot(sc);
3486 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3487 			      reboot);
3488 		/* restore PCI configuration space */
3489 		dinfo = device_get_ivars(sc->dev);
3490 		pci_cfg_restore(sc->dev, dinfo);
3491 
3492 		/* and redo any changes we made to our config space */
3493 		mxge_setup_cfg_space(sc);
3494 
3495 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3496 			mxge_close(sc);
3497 			err = mxge_open(sc);
3498 		}
3499 	} else {
3500 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3501 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3502 			      sc->ss->tx.req, sc->ss->tx.done);
3503 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3504 			      sc->ss->tx.pkt_done,
3505 			      be32toh(sc->ss->fw_stats->send_done_count));
3506 		device_printf(sc->dev, "not resetting\n");
3507 	}
3508 	return (err);
3509 }
3510 
3511 static int
3512 mxge_watchdog(mxge_softc_t *sc)
3513 {
3514 	mxge_tx_ring_t *tx = &sc->ss->tx;
3515 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3516 	int err = 0;
3517 
3518 	/* see if we have outstanding transmits, which
3519 	   have been pending for more than mxge_ticks */
3520 	if (tx->req != tx->done &&
3521 	    tx->watchdog_req != tx->watchdog_done &&
3522 	    tx->done == tx->watchdog_done) {
3523 		/* check for pause blocking before resetting */
3524 		if (tx->watchdog_rx_pause == rx_pause)
3525 			err = mxge_watchdog_reset(sc);
3526 		else
3527 			device_printf(sc->dev, "Flow control blocking "
3528 				      "xmits, check link partner\n");
3529 	}
3530 
3531 	tx->watchdog_req = tx->req;
3532 	tx->watchdog_done = tx->done;
3533 	tx->watchdog_rx_pause = rx_pause;
3534 
3535 	if (sc->need_media_probe)
3536 		mxge_media_probe(sc);
3537 	return (err);
3538 }
3539 
3540 static void
3541 mxge_update_stats(mxge_softc_t *sc)
3542 {
3543 	struct mxge_slice_state *ss;
3544 	u_long ipackets = 0;
3545 	int slice;
3546 
3547 	for(slice = 0; slice < sc->num_slices; slice++) {
3548 		ss = &sc->ss[slice];
3549 		ipackets += ss->ipackets;
3550 	}
3551 	sc->ifp->if_ipackets = ipackets;
3552 
3553 }
3554 static void
3555 mxge_tick(void *arg)
3556 {
3557 	mxge_softc_t *sc = arg;
3558 	int err = 0;
3559 
3560 	/* aggregate stats from different slices */
3561 	mxge_update_stats(sc);
3562 	if (!sc->watchdog_countdown) {
3563 		err = mxge_watchdog(sc);
3564 		sc->watchdog_countdown = 4;
3565 	}
3566 	sc->watchdog_countdown--;
3567 	if (err == 0)
3568 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3569 
3570 }
3571 
3572 static int
3573 mxge_media_change(struct ifnet *ifp)
3574 {
3575 	return EINVAL;
3576 }
3577 
3578 static int
3579 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3580 {
3581 	struct ifnet *ifp = sc->ifp;
3582 	int real_mtu, old_mtu;
3583 	int err = 0;
3584 
3585 
3586 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3587 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3588 		return EINVAL;
3589 	mtx_lock(&sc->driver_mtx);
3590 	old_mtu = ifp->if_mtu;
3591 	ifp->if_mtu = mtu;
3592 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3593 		mxge_close(sc);
3594 		err = mxge_open(sc);
3595 		if (err != 0) {
3596 			ifp->if_mtu = old_mtu;
3597 			mxge_close(sc);
3598 			(void) mxge_open(sc);
3599 		}
3600 	}
3601 	mtx_unlock(&sc->driver_mtx);
3602 	return err;
3603 }
3604 
3605 static void
3606 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3607 {
3608 	mxge_softc_t *sc = ifp->if_softc;
3609 
3610 
3611 	if (sc == NULL)
3612 		return;
3613 	ifmr->ifm_status = IFM_AVALID;
3614 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3615 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3616 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3617 }
3618 
3619 static int
3620 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3621 {
3622 	mxge_softc_t *sc = ifp->if_softc;
3623 	struct ifreq *ifr = (struct ifreq *)data;
3624 	int err, mask;
3625 
3626 	err = 0;
3627 	switch (command) {
3628 	case SIOCSIFADDR:
3629 	case SIOCGIFADDR:
3630 		err = ether_ioctl(ifp, command, data);
3631 		break;
3632 
3633 	case SIOCSIFMTU:
3634 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3635 		break;
3636 
3637 	case SIOCSIFFLAGS:
3638 		mtx_lock(&sc->driver_mtx);
3639 		if (ifp->if_flags & IFF_UP) {
3640 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3641 				err = mxge_open(sc);
3642 			} else {
3643 				/* take care of promisc and allmulti
3644 				   flag changes */
3645 				mxge_change_promisc(sc,
3646 						    ifp->if_flags & IFF_PROMISC);
3647 				mxge_set_multicast_list(sc);
3648 			}
3649 		} else {
3650 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3651 				mxge_close(sc);
3652 			}
3653 		}
3654 		mtx_unlock(&sc->driver_mtx);
3655 		break;
3656 
3657 	case SIOCADDMULTI:
3658 	case SIOCDELMULTI:
3659 		mtx_lock(&sc->driver_mtx);
3660 		mxge_set_multicast_list(sc);
3661 		mtx_unlock(&sc->driver_mtx);
3662 		break;
3663 
3664 	case SIOCSIFCAP:
3665 		mtx_lock(&sc->driver_mtx);
3666 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3667 		if (mask & IFCAP_TXCSUM) {
3668 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3669 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3670 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3671 						      | CSUM_TSO);
3672 			} else {
3673 				ifp->if_capenable |= IFCAP_TXCSUM;
3674 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3675 			}
3676 		} else if (mask & IFCAP_RXCSUM) {
3677 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3678 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3679 				sc->csum_flag = 0;
3680 			} else {
3681 				ifp->if_capenable |= IFCAP_RXCSUM;
3682 				sc->csum_flag = 1;
3683 			}
3684 		}
3685 		if (mask & IFCAP_TSO4) {
3686 			if (IFCAP_TSO4 & ifp->if_capenable) {
3687 				ifp->if_capenable &= ~IFCAP_TSO4;
3688 				ifp->if_hwassist &= ~CSUM_TSO;
3689 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3690 				ifp->if_capenable |= IFCAP_TSO4;
3691 				ifp->if_hwassist |= CSUM_TSO;
3692 			} else {
3693 				printf("mxge requires tx checksum offload"
3694 				       " be enabled to use TSO\n");
3695 				err = EINVAL;
3696 			}
3697 		}
3698 		if (mask & IFCAP_LRO) {
3699 			if (IFCAP_LRO & ifp->if_capenable)
3700 				err = mxge_change_lro_locked(sc, 0);
3701 			else
3702 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3703 		}
3704 		if (mask & IFCAP_VLAN_HWTAGGING)
3705 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3706 		mtx_unlock(&sc->driver_mtx);
3707 		VLAN_CAPABILITIES(ifp);
3708 
3709 		break;
3710 
3711 	case SIOCGIFMEDIA:
3712 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3713 				    &sc->media, command);
3714 		break;
3715 
3716 	default:
3717 		err = ENOTTY;
3718 	}
3719 	return err;
3720 }
3721 
3722 static void
3723 mxge_fetch_tunables(mxge_softc_t *sc)
3724 {
3725 
3726 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3727 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3728 			  &mxge_flow_control);
3729 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3730 			  &mxge_intr_coal_delay);
3731 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3732 			  &mxge_nvidia_ecrc_enable);
3733 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3734 			  &mxge_force_firmware);
3735 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3736 			  &mxge_deassert_wait);
3737 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3738 			  &mxge_verbose);
3739 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3740 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3741 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3742 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3743 	if (sc->lro_cnt != 0)
3744 		mxge_lro_cnt = sc->lro_cnt;
3745 
3746 	if (bootverbose)
3747 		mxge_verbose = 1;
3748 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3749 		mxge_intr_coal_delay = 30;
3750 	if (mxge_ticks == 0)
3751 		mxge_ticks = hz / 2;
3752 	sc->pause = mxge_flow_control;
3753 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3754 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3755 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3756 	}
3757 }
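
/*
 * These are kernel environment tunables; a minimal illustrative
 * /boot/loader.conf fragment might look like:
 *
 *	hw.mxge.max_slices="-1"		# cap slices at ncpus
 *	hw.mxge.intr_coal_delay="30"	# usecs; clamped to [0, 10000] above
 *	hw.mxge.verbose="1"
 */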
3758 
3759 
3760 static void
3761 mxge_free_slices(mxge_softc_t *sc)
3762 {
3763 	struct mxge_slice_state *ss;
3764 	int i;
3765 
3766 
3767 	if (sc->ss == NULL)
3768 		return;
3769 
3770 	for (i = 0; i < sc->num_slices; i++) {
3771 		ss = &sc->ss[i];
3772 		if (ss->fw_stats != NULL) {
3773 			mxge_dma_free(&ss->fw_stats_dma);
3774 			ss->fw_stats = NULL;
3775 			mtx_destroy(&ss->tx.mtx);
3776 		}
3777 		if (ss->rx_done.entry != NULL) {
3778 			mxge_dma_free(&ss->rx_done.dma);
3779 			ss->rx_done.entry = NULL;
3780 		}
3781 	}
3782 	free(sc->ss, M_DEVBUF);
3783 	sc->ss = NULL;
3784 }
3785 
3786 static int
3787 mxge_alloc_slices(mxge_softc_t *sc)
3788 {
3789 	mxge_cmd_t cmd;
3790 	struct mxge_slice_state *ss;
3791 	size_t bytes;
3792 	int err, i, max_intr_slots;
3793 
3794 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3795 	if (err != 0) {
3796 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3797 		return err;
3798 	}
3799 	sc->rx_ring_size = cmd.data0;
3800 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
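	/*
	 * The factor of two presumably leaves room for completions
	 * from both receive rings (small and big buffers) to land in
	 * the same interrupt queue.
	 */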
3801 
3802 	bytes = sizeof (*sc->ss) * sc->num_slices;
3803 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3804 	if (sc->ss == NULL)
3805 		return (ENOMEM);
3806 	for (i = 0; i < sc->num_slices; i++) {
3807 		ss = &sc->ss[i];
3808 
3809 		ss->sc = sc;
3810 
3811 		/* allocate per-slice rx interrupt queues */
3812 
3813 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3814 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3815 		if (err != 0)
3816 			goto abort;
3817 		ss->rx_done.entry = ss->rx_done.dma.addr;
3818 		bzero(ss->rx_done.entry, bytes);
3819 
3820 		/*
3821 		 * allocate the per-slice firmware stats; stats
3822 		 * (including tx) are used only on the first
3823 		 * slice for now
3824 		 */
3825 		if (i > 0)
3826 			continue;
3827 
3828 		bytes = sizeof (*ss->fw_stats);
3829 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3830 				     sizeof (*ss->fw_stats), 64);
3831 		if (err != 0)
3832 			goto abort;
3833 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3834 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
3835 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
3836 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
3837 	}
3838 
3839 	return (0);
3840 
3841 abort:
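	/* tear down any partially allocated slices; note that all
	   failures above are reported as ENOMEM */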
3842 	mxge_free_slices(sc);
3843 	return (ENOMEM);
3844 }
3845 
3846 static void
3847 mxge_slice_probe(mxge_softc_t *sc)
3848 {
3849 	mxge_cmd_t cmd;
3850 	char *old_fw;
3851 	int msix_cnt, status, max_intr_slots;
3852 
3853 	sc->num_slices = 1;
3854 	/*
3855 	 *  don't use multiple slices unless they were explicitly
3856 	 *  requested, or if this is not an SMP system
3857 	 */
3858 
3859 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
3860 		return;
3861 
3862 	/* see how many MSI-X interrupts are available */
3863 	msix_cnt = pci_msix_count(sc->dev);
3864 	if (msix_cnt < 2)
3865 		return;
3866 
3867 	/* now load the slice-aware firmware and see what it supports */
3868 	old_fw = sc->fw_name;
3869 	if (old_fw == mxge_fw_aligned)
3870 		sc->fw_name = mxge_fw_rss_aligned;
3871 	else
3872 		sc->fw_name = mxge_fw_rss_unaligned;
3873 	status = mxge_load_firmware(sc, 0);
3874 	if (status != 0) {
3875 		device_printf(sc->dev, "Falling back to a single slice\n");
3876 		return;
3877 	}
3878 
3879 	/* try to send a reset command to the card to see if it
3880 	   is alive */
3881 	memset(&cmd, 0, sizeof (cmd));
3882 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3883 	if (status != 0) {
3884 		device_printf(sc->dev, "failed reset\n");
3885 		goto abort_with_fw;
3886 	}
3887 
3888 	/* get rx ring size */
3889 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3890 	if (status != 0) {
3891 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3892 		goto abort_with_fw;
3893 	}
3894 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3895 
3896 	/* tell it the size of the interrupt queues */
3897 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3898 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3899 	if (status != 0) {
3900 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3901 		goto abort_with_fw;
3902 	}
3903 
3904 	/* ask for the maximum number of slices it supports */
3905 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3906 	if (status != 0) {
3907 		device_printf(sc->dev,
3908 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3909 		goto abort_with_fw;
3910 	}
3911 	sc->num_slices = cmd.data0;
3912 	if (sc->num_slices > msix_cnt)
3913 		sc->num_slices = msix_cnt;
3914 
3915 	if (mxge_max_slices == -1) {
3916 		/* cap to number of CPUs in system */
3917 		if (sc->num_slices > mp_ncpus)
3918 			sc->num_slices = mp_ncpus;
3919 	} else {
3920 		if (sc->num_slices > mxge_max_slices)
3921 			sc->num_slices = mxge_max_slices;
3922 	}
3923 	/* make sure it is a power of two */
3924 	while (sc->num_slices & (sc->num_slices - 1))
3925 		sc->num_slices--;
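	/*
	 * e.g. 6 slices rounds down to 4 here; the firmware's RSS
	 * distribution apparently assumes a power-of-two slice count.
	 */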
3926 
3927 	if (mxge_verbose)
3928 		device_printf(sc->dev, "using %d slices\n",
3929 			      sc->num_slices);
3930 
3931 	return;
3932 
3933 abort_with_fw:
3934 	sc->fw_name = old_fw;
3935 	(void) mxge_load_firmware(sc, 0);
3936 }
3937 
3938 static int
3939 mxge_add_msix_irqs(mxge_softc_t *sc)
3940 {
3941 	size_t bytes;
3942 	int count, err, i, rid;
3943 
3944 	rid = PCIR_BAR(2);
3945 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3946 						    &rid, RF_ACTIVE);
3947 
3948 	if (sc->msix_table_res == NULL) {
3949 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3950 		return ENXIO;
3951 	}
3952 
3953 	count = sc->num_slices;
3954 	err = pci_alloc_msix(sc->dev, &count);
3955 	if (err != 0) {
3956 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3957 			      "err = %d\n", sc->num_slices, err);
3958 		goto abort_with_msix_table;
3959 	}
3960 	if (count < sc->num_slices) {
3961 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3962 			      sc->num_slices, count);
3963 		device_printf(sc->dev,
3964 			      "Try setting hw.mxge.max_slices to %d\n",
3965 			      count);
3966 		err = ENOSPC;
3967 		goto abort_with_msix;
3968 	}
3969 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3970 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3971 	if (sc->msix_irq_res == NULL) {
3972 		err = ENOMEM;
3973 		goto abort_with_msix;
3974 	}
3975 
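	/* MSI-X message i is exposed as SYS_RES_IRQ rid i + 1
	   (rid 0 is reserved for the legacy INTx line) */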
3976 	for (i = 0; i < sc->num_slices; i++) {
3977 		rid = i + 1;
3978 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3979 							  SYS_RES_IRQ,
3980 							  &rid, RF_ACTIVE);
3981 		if (sc->msix_irq_res[i] == NULL) {
3982 			device_printf(sc->dev, "couldn't allocate IRQ res"
3983 				      " for message %d\n", i);
3984 			err = ENXIO;
3985 			goto abort_with_res;
3986 		}
3987 	}
3988 
3989 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3990 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3991 
3992 	for (i = 0; i < sc->num_slices; i++) {
3993 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3994 				     INTR_TYPE_NET | INTR_MPSAFE,
3995 #if __FreeBSD_version > 700030
3996 				     NULL,
3997 #endif
3998 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
3999 		if (err != 0) {
4000 			device_printf(sc->dev, "couldn't setup intr for "
4001 				      "message %d\n", i);
4002 			goto abort_with_intr;
4003 		}
4004 	}
4005 
4006 	if (mxge_verbose) {
4007 		device_printf(sc->dev, "using %d msix IRQs:",
4008 			      sc->num_slices);
4009 		for (i = 0; i < sc->num_slices; i++)
4010 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4011 		printf("\n");
4012 	}
4013 	return (0);
4014 
4015 abort_with_intr:
4016 	for (i = 0; i < sc->num_slices; i++) {
4017 		if (sc->msix_ih[i] != NULL) {
4018 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4019 					  sc->msix_ih[i]);
4020 			sc->msix_ih[i] = NULL;
4021 		}
4022 	}
4023 	free(sc->msix_ih, M_DEVBUF);
4024 
4025 
4026 abort_with_res:
4027 	for (i = 0; i < sc->num_slices; i++) {
4028 		rid = i + 1;
4029 		if (sc->msix_irq_res[i] != NULL)
4030 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4031 					     sc->msix_irq_res[i]);
4032 		sc->msix_irq_res[i] = NULL;
4033 	}
4034 	free(sc->msix_irq_res, M_DEVBUF);
4035 
4036 
4037 abort_with_msix:
4038 	pci_release_msi(sc->dev);
4039 
4040 abort_with_msix_table:
4041 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4042 			     sc->msix_table_res);
4043 
4044 	return err;
4045 }
4046 
4047 static int
4048 mxge_add_single_irq(mxge_softc_t *sc)
4049 {
4050 	int count, err, rid;
4051 
4052 	count = pci_msi_count(sc->dev);
4053 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4054 		rid = 1;
4055 	} else {
4056 		rid = 0;
4057 		sc->legacy_irq = 1;
4058 	}
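	/* a single MSI message shows up as rid 1; otherwise we fall
	   back to the shared legacy INTx interrupt at rid 0 */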
4059 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4060 					 1, RF_SHAREABLE | RF_ACTIVE);
4061 	if (sc->irq_res == NULL) {
4062 		device_printf(sc->dev, "could not alloc interrupt\n");
4063 		return ENXIO;
4064 	}
4065 	if (mxge_verbose)
4066 		device_printf(sc->dev, "using %s irq %ld\n",
4067 			      sc->legacy_irq ? "INTx" : "MSI",
4068 			      rman_get_start(sc->irq_res));
4069 	err = bus_setup_intr(sc->dev, sc->irq_res,
4070 			     INTR_TYPE_NET | INTR_MPSAFE,
4071 #if __FreeBSD_version > 700030
4072 			     NULL,
4073 #endif
4074 			     mxge_intr, &sc->ss[0], &sc->ih);
4075 	if (err != 0) {
4076 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4077 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4078 		if (!sc->legacy_irq)
4079 			pci_release_msi(sc->dev);
4080 	}
4081 	return err;
4082 }
4083 
4084 static void
4085 mxge_rem_msix_irqs(mxge_softc_t *sc)
4086 {
4087 	int i, rid;
4088 
4089 	for (i = 0; i < sc->num_slices; i++) {
4090 		if (sc->msix_ih[i] != NULL) {
4091 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4092 					  sc->msix_ih[i]);
4093 			sc->msix_ih[i] = NULL;
4094 		}
4095 	}
4096 	free(sc->msix_ih, M_DEVBUF);
4097 
4098 	for (i = 0; i < sc->num_slices; i++) {
4099 		rid = i + 1;
4100 		if (sc->msix_irq_res[i] != NULL)
4101 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4102 					     sc->msix_irq_res[i]);
4103 		sc->msix_irq_res[i] = NULL;
4104 	}
4105 	free(sc->msix_irq_res, M_DEVBUF);
4106 
4107 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4108 			     sc->msix_table_res);
4109 
4110 	pci_release_msi(sc->dev);
4111 	return;
4112 }
4113 
4114 static void
4115 mxge_rem_single_irq(mxge_softc_t *sc)
4116 {
4117 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4118 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4119 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4120 	if (!sc->legacy_irq)
4121 		pci_release_msi(sc->dev);
4122 }
4123 
4124 static void
4125 mxge_rem_irq(mxge_softc_t *sc)
4126 {
4127 	if (sc->num_slices > 1)
4128 		mxge_rem_msix_irqs(sc);
4129 	else
4130 		mxge_rem_single_irq(sc);
4131 }
4132 
4133 static int
4134 mxge_add_irq(mxge_softc_t *sc)
4135 {
4136 	int err;
4137 
4138 	if (sc->num_slices > 1)
4139 		err = mxge_add_msix_irqs(sc);
4140 	else
4141 		err = mxge_add_single_irq(sc);
4142 
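	/* intentionally disabled via the "0 &&" short-circuit;
	   apparently left in place for debugging the MSI-X
	   teardown/re-add path */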
4143 	if (0 && err == 0 && sc->num_slices > 1) {
4144 		mxge_rem_msix_irqs(sc);
4145 		err = mxge_add_msix_irqs(sc);
4146 	}
4147 	return err;
4148 }
4149 
4150 
4151 static int
4152 mxge_attach(device_t dev)
4153 {
4154 	mxge_softc_t *sc = device_get_softc(dev);
4155 	struct ifnet *ifp;
4156 	int err, rid;
4157 
4158 	sc->dev = dev;
4159 	mxge_fetch_tunables(sc);
4160 
4161 	err = bus_dma_tag_create(NULL,			/* parent */
4162 				 1,			/* alignment */
4163 				 0,			/* boundary */
4164 				 BUS_SPACE_MAXADDR,	/* low */
4165 				 BUS_SPACE_MAXADDR,	/* high */
4166 				 NULL, NULL,		/* filter */
4167 				 65536 + 256,		/* maxsize */
4168 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4169 				 65536,			/* maxsegsize */
4170 				 0,			/* flags */
4171 				 NULL, NULL,		/* lock */
4172 				 &sc->parent_dmat);	/* tag */
4173 
4174 	if (err != 0) {
4175 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4176 			      err);
4177 		goto abort_with_nothing;
4178 	}
4179 
4180 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4181 	if (ifp == NULL) {
4182 		device_printf(dev, "can not if_alloc()\n");
4183 		err = ENOSPC;
4184 		goto abort_with_parent_dmat;
4185 	}
4186 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4187 
4188 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4189 		 device_get_nameunit(dev));
4190 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4191 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4192 		 "%s:drv", device_get_nameunit(dev));
4193 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4194 		 MTX_NETWORK_LOCK, MTX_DEF);
4195 
4196 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4197 
4198 	mxge_setup_cfg_space(sc);
4199 
4200 	/* Map the board into the kernel */
4201 	rid = PCIR_BARS;
4202 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4203 					 ~0, 1, RF_ACTIVE);
4204 	if (sc->mem_res == NULL) {
4205 		device_printf(dev, "could not map memory\n");
4206 		err = ENXIO;
4207 		goto abort_with_lock;
4208 	}
4209 	sc->sram = rman_get_virtual(sc->mem_res);
4210 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
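	/*
	 * The board has 2MB of SRAM; the subtracted constants appear
	 * to carve out reserved regions at the top (firmware and
	 * scratch space) that are not available to the host.
	 */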
4211 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4212 		device_printf(dev, "impossible memory region size %ld\n",
4213 			      rman_get_size(sc->mem_res));
4214 		err = ENXIO;
4215 		goto abort_with_mem_res;
4216 	}
4217 
4218 	/* make a NUL-terminated copy of the EEPROM strings section of
4219 	   Lanai SRAM */
4220 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4221 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4222 				rman_get_bushandle(sc->mem_res),
4223 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4224 				sc->eeprom_strings,
4225 				MXGE_EEPROM_STRINGS_SIZE - 2);
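	/* reading 2 bytes less than the zeroed buffer guarantees the
	   string list is NUL terminated */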
4226 	err = mxge_parse_strings(sc);
4227 	if (err != 0)
4228 		goto abort_with_mem_res;
4229 
4230 	/* Enable write combining for efficient use of PCIe bus */
4231 	mxge_enable_wc(sc);
4232 
4233 	/* Allocate the out of band dma memory */
4234 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4235 			     sizeof (mxge_cmd_t), 64);
4236 	if (err != 0)
4237 		goto abort_with_mem_res;
4238 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4239 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4240 	if (err != 0)
4241 		goto abort_with_cmd_dma;
4242 
4243 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4244 	if (err != 0)
4245 		goto abort_with_zeropad_dma;
4246 
4247 	/* select & load the firmware */
4248 	err = mxge_select_firmware(sc);
4249 	if (err != 0)
4250 		goto abort_with_dmabench;
4251 	sc->intr_coal_delay = mxge_intr_coal_delay;
4252 
4253 	mxge_slice_probe(sc);
4254 	err = mxge_alloc_slices(sc);
4255 	if (err != 0)
4256 		goto abort_with_dmabench;
4257 
4258 	err = mxge_reset(sc, 0);
4259 	if (err != 0)
4260 		goto abort_with_slices;
4261 
4262 	err = mxge_alloc_rings(sc);
4263 	if (err != 0) {
4264 		device_printf(sc->dev, "failed to allocate rings\n");
4265 		goto abort_with_dmabench;
4266 	}
4267 
4268 	err = mxge_add_irq(sc);
4269 	if (err != 0) {
4270 		device_printf(sc->dev, "failed to add irq\n");
4271 		goto abort_with_rings;
4272 	}
4273 
4274 	ifp->if_baudrate = IF_Gbps(10UL);
4275 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4276 		IFCAP_VLAN_MTU | IFCAP_LRO;
4277 
4278 #ifdef MXGE_NEW_VLAN_API
4279 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4280 #endif
4281 
4282 	sc->max_mtu = mxge_max_mtu(sc);
4283 	if (sc->max_mtu >= 9000)
4284 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4285 	else
4286 		device_printf(dev, "MTU limited to %d.  Install "
4287 			      "latest firmware for 9000 byte jumbo support\n",
4288 			      sc->max_mtu - ETHER_HDR_LEN);
4289 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4290 	ifp->if_capenable = ifp->if_capabilities;
4291 	if (sc->lro_cnt == 0)
4292 		ifp->if_capenable &= ~IFCAP_LRO;
4293 	sc->csum_flag = 1;
4294 	ifp->if_init = mxge_init;
4295 	ifp->if_softc = sc;
4296 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4297 	ifp->if_ioctl = mxge_ioctl;
4298 	ifp->if_start = mxge_start;
4299 	/* Initialise the ifmedia structure */
4300 	ifmedia_init(&sc->media, 0, mxge_media_change,
4301 		     mxge_media_status);
4302 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4303 	mxge_media_probe(sc);
4304 	ether_ifattach(ifp, sc->mac_addr);
4305 	/* ether_ifattach sets mtu to 1500 */
4306 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4307 		ifp->if_mtu = 9000;
4308 
4309 	mxge_add_sysctls(sc);
4310 	return 0;
4311 
4312 abort_with_rings:
4313 	mxge_free_rings(sc);
4314 abort_with_slices:
4315 	mxge_free_slices(sc);
4316 abort_with_dmabench:
4317 	mxge_dma_free(&sc->dmabench_dma);
4318 abort_with_zeropad_dma:
4319 	mxge_dma_free(&sc->zeropad_dma);
4320 abort_with_cmd_dma:
4321 	mxge_dma_free(&sc->cmd_dma);
4322 abort_with_mem_res:
4323 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4324 abort_with_lock:
4325 	pci_disable_busmaster(dev);
4326 	mtx_destroy(&sc->cmd_mtx);
4327 	mtx_destroy(&sc->driver_mtx);
4328 	if_free(ifp);
4329 abort_with_parent_dmat:
4330 	bus_dma_tag_destroy(sc->parent_dmat);
4331 
4332 abort_with_nothing:
4333 	return err;
4334 }
4335 
4336 static int
4337 mxge_detach(device_t dev)
4338 {
4339 	mxge_softc_t *sc = device_get_softc(dev);
4340 
4341 	if (mxge_vlans_active(sc)) {
4342 		device_printf(sc->dev,
4343 			      "Detach vlans before removing module\n");
4344 		return EBUSY;
4345 	}
4346 	mtx_lock(&sc->driver_mtx);
4347 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4348 		mxge_close(sc);
4349 	mtx_unlock(&sc->driver_mtx);
4350 	ether_ifdetach(sc->ifp);
4351 	callout_drain(&sc->co_hdl);
4352 	ifmedia_removeall(&sc->media);
4353 	mxge_dummy_rdma(sc, 0);
4354 	mxge_rem_sysctls(sc);
4355 	mxge_rem_irq(sc);
4356 	mxge_free_rings(sc);
4357 	mxge_free_slices(sc);
4358 	mxge_dma_free(&sc->dmabench_dma);
4359 	mxge_dma_free(&sc->zeropad_dma);
4360 	mxge_dma_free(&sc->cmd_dma);
4361 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4362 	pci_disable_busmaster(dev);
4363 	mtx_destroy(&sc->cmd_mtx);
4364 	mtx_destroy(&sc->driver_mtx);
4365 	if_free(sc->ifp);
4366 	bus_dma_tag_destroy(sc->parent_dmat);
4367 	return 0;
4368 }
4369 
4370 static int
4371 mxge_shutdown(device_t dev)
4372 {
4373 	return 0;
4374 }
4375 
4376 /*
4377   This file uses Myri10GE driver indentation.
4378 
4379   Local Variables:
4380   c-file-style:"linux"
4381   tab-width:8
4382   End:
4383 */
4384