xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 07a2f99c)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $
29 
30 ***************************************************************************/
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/linker.h>
35 #include <sys/firmware.h>
36 #include <sys/endian.h>
37 #include <sys/in_cksum.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/module.h>
43 #include <sys/serialize.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
46 
47 /* count xmits ourselves, rather than via drbr */
48 #define NO_SLOW_STATS
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <sys/bus.h>
68 #include <sys/rman.h>
69 
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
73 
74 #include <vm/vm.h>		/* for pmap_mapdev() */
75 #include <vm/pmap.h>
76 
77 #if defined(__i386) || defined(__x86_64)
78 #include <machine/specialreg.h>
79 #endif
80 
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
85 #ifdef IFNET_BUF_RING
86 #include <sys/buf_ring.h>
87 #endif
88 
89 #include "opt_inet.h"
90 
91 /* tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 /* XXX: not yet */
104 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
105 static int mxge_initial_mtu = ETHERMTU;
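/*
 * Firmware image names: the "p" (ethp/rss_ethp) images work around
 * unaligned PCI-E completions, and the rss images are used when
 * multiple slices are enabled.  See the alignment discussion above
 * mxge_firmware_probe() below.
 */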
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
116 
117 static device_method_t mxge_methods[] =
118 {
119   /* Device interface */
120   DEVMETHOD(device_probe, mxge_probe),
121   DEVMETHOD(device_attach, mxge_attach),
122   DEVMETHOD(device_detach, mxge_detach),
123   DEVMETHOD(device_shutdown, mxge_shutdown),
124   {0, 0}
125 };
126 
127 static driver_t mxge_driver =
128 {
129   "mxge",
130   mxge_methods,
131   sizeof(mxge_softc_t),
132 };
133 
134 static devclass_t mxge_devclass;
135 
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
146 
147 /* XXX: we don't have Large Receive Offload support yet */
inline int
149 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
150 {
151 	(void)ss;
152 	(void)m_head;
153 	(void)csum;
154 	return 1;
155 }
156 
inline void
158 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
159 {
160 	(void)ss;
161 	(void)lro;
162 }
163 
164 static int
165 mxge_probe(device_t dev)
166 {
167 	int rev;
168 
169 
170 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
171 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
172 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
173 		rev = pci_get_revid(dev);
174 		switch (rev) {
175 		case MXGE_PCI_REV_Z8E:
176 			device_set_desc(dev, "Myri10G-PCIE-8A");
177 			break;
178 		case MXGE_PCI_REV_Z8ES:
179 			device_set_desc(dev, "Myri10G-PCIE-8B");
180 			break;
181 		default:
182 			device_set_desc(dev, "Myri10G-PCIE-8??");
183 			device_printf(dev, "Unrecognized rev %d NIC\n",
184 				      rev);
185 			break;
186 		}
187 		return 0;
188 	}
189 	return ENXIO;
190 }
191 
192 static void
193 mxge_enable_wc(mxge_softc_t *sc)
194 {
195 #if 0
196 #if defined(__i386) || defined(__x86_64)
197 	vm_offset_t len;
198 	int err;
199 
200 	sc->wc = 1;
201 	len = rman_get_size(sc->mem_res);
202 	err = pmap_change_attr((vm_offset_t) sc->sram,
203 			       len, PAT_WRITE_COMBINING);
204 	if (err != 0) {
205 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
206 			      err);
207 		sc->wc = 0;
208 	}
209 #endif
210 #else
211 	sc->wc = 0;	/* TBD: PAT support */
212 #endif
213 }
214 
215 
216 /* callback to get our DMA address */
217 static void
218 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
219 			 int error)
220 {
221 	if (error == 0) {
222 		*(bus_addr_t *) arg = segs->ds_addr;
223 	}
224 }
225 
226 static int
227 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
228 		   bus_size_t alignment)
229 {
230 	int err;
231 	device_t dev = sc->dev;
232 	bus_size_t boundary, maxsegsize;
233 
234 	if (bytes > 4096 && alignment == 4096) {
235 		boundary = 0;
236 		maxsegsize = bytes;
237 	} else {
238 		boundary = 4096;
239 		maxsegsize = 4096;
240 	}
241 
242 	/* allocate DMAable memory tags */
243 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
244 				 alignment,		/* alignment */
245 				 boundary,		/* boundary */
246 				 BUS_SPACE_MAXADDR,	/* low */
247 				 BUS_SPACE_MAXADDR,	/* high */
248 				 NULL, NULL,		/* filter */
249 				 bytes,			/* maxsize */
250 				 1,			/* num segs */
251 				 maxsegsize,		/* maxsegsize */
252 				 BUS_DMA_COHERENT,	/* flags */
253 				 &dma->dmat);		/* tag */
254 	if (err != 0) {
255 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
256 		return err;
257 	}
258 
259 	/* allocate DMAable memory & map */
260 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
261 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
262 				| BUS_DMA_ZERO),  &dma->map);
263 	if (err != 0) {
264 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
265 		goto abort_with_dmat;
266 	}
267 
268 	/* load the memory */
269 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
270 			      mxge_dmamap_callback,
271 			      (void *)&dma->bus_addr, 0);
272 	if (err != 0) {
273 		device_printf(dev, "couldn't load map (err = %d)\n", err);
274 		goto abort_with_mem;
275 	}
276 	return 0;
277 
278 abort_with_mem:
279 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 abort_with_dmat:
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 	return err;
283 }
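
/*
 * Illustrative use of the two helpers above (the size and alignment
 * values are hypothetical):
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
 *		// dma.addr is the kernel VA, dma.bus_addr the device address
 *		mxge_dma_free(&dma);
 *	}
 */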
284 
285 
286 static void
287 mxge_dma_free(mxge_dma_t *dma)
288 {
289 	bus_dmamap_unload(dma->dmat, dma->map);
290 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
291 	(void)bus_dma_tag_destroy(dma->dmat);
292 }
293 
294 /*
295  * The eeprom strings on the lanaiX have the format
296  * SN=x\0
297  * MAC=x:x:x:x:x:x\0
298  * PC=text\0
299  */
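/* e.g. (made-up values): "SN=123456\0MAC=00:60:dd:12:34:56\0PC=SAMPLE\0\0" */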
300 
301 static int
302 mxge_parse_strings(mxge_softc_t *sc)
303 {
304 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
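/* NB: the macro ignores its argument and advances the caller's "ptr" */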
305 
306 	char *ptr, *limit;
307 	int i, found_mac;
308 
309 	ptr = sc->eeprom_strings;
310 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 	found_mac = 0;
312 	while (ptr < limit && *ptr != '\0') {
313 		if (memcmp(ptr, "MAC=", 4) == 0) {
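			/*
			 * The ptr += 1 here plus the first ptr += 3 in
			 * the loop below add up to 4, stepping over the
			 * "MAC=" prefix to the first hex digit.
			 */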
314 			ptr += 1;
315 			sc->mac_addr_string = ptr;
316 			for (i = 0; i < 6; i++) {
317 				ptr += 3;
318 				if ((ptr + 2) > limit)
319 					goto abort;
320 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
321 				found_mac = 1;
322 			}
323 		} else if (memcmp(ptr, "PC=", 3) == 0) {
324 			ptr += 3;
325 			strncpy(sc->product_code_string, ptr,
326 				sizeof (sc->product_code_string) - 1);
327 		} else if (memcmp(ptr, "SN=", 3) == 0) {
328 			ptr += 3;
329 			strncpy(sc->serial_number_string, ptr,
330 				sizeof (sc->serial_number_string) - 1);
331 		}
332 		MXGE_NEXT_STRING(ptr);
333 	}
334 
335 	if (found_mac)
336 		return 0;
337 
338  abort:
339 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
340 
341 	return ENXIO;
342 }
343 
344 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
345 static void
346 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
347 {
348 	uint32_t val;
349 	unsigned long base, off;
350 	char *va, *cfgptr;
351 	device_t pdev, mcp55;
352 	uint16_t vendor_id, device_id, word;
353 	uintptr_t bus, slot, func, ivend, idev;
354 	uint32_t *ptr32;
355 
356 
357 	if (!mxge_nvidia_ecrc_enable)
358 		return;
359 
360 	pdev = device_get_parent(device_get_parent(sc->dev));
361 	if (pdev == NULL) {
362 		device_printf(sc->dev, "could not find parent?\n");
363 		return;
364 	}
365 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
366 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 
368 	if (vendor_id != 0x10de)
369 		return;
370 
371 	base = 0;
372 
373 	if (device_id == 0x005d) {
374 		/* ck804, base address is magic */
375 		base = 0xe0000000UL;
376 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
377 		/* mcp55, base address stored in chipset */
378 		mcp55 = pci_find_bsf(0, 0, 0);
379 		if (mcp55 &&
380 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
381 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
382 			word = pci_read_config(mcp55, 0x90, 2);
383 			base = ((unsigned long)word & 0x7ffeU) << 25;
384 		}
385 	}
386 	if (!base)
387 		return;
388 
	/* XXXX
	   The test below is commented out because it is believed that
	   doing config read/write beyond 0xff will access the config
	   space of the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended PCIe config space access.
	*/
396 #if 0
397 	/* See if we can, by some miracle, access the extended
398 	   config space */
399 	val = pci_read_config(pdev, 0x178, 4);
400 	if (val != 0xffffffff) {
401 		val |= 0x40;
402 		pci_write_config(pdev, 0x178, val, 4);
403 		return;
404 	}
405 #endif
406 	/* Rather than using normal pci config space writes, we must
407 	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
410 	 * device (the on-chip northbridge), or the amd-8131 bridge
411 	 * and things behind them are not visible by this method.
412 	 */
413 
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_BUS, &bus);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_SLOT, &slot);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_FUNCTION, &func);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_VENDOR, &ivend);
422 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
423 		      PCI_IVAR_DEVICE, &idev);
424 
425 	off =  base
426 		+ 0x00100000UL * (unsigned long)bus
427 		+ 0x00001000UL * (unsigned long)(func
428 						 + 8 * slot);
429 
430 	/* map it into the kernel */
431 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
432 
433 
434 	if (va == NULL) {
435 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
436 		return;
437 	}
438 	/* get a pointer to the config space mapped into the kernel */
439 	cfgptr = va + (off & PAGE_MASK);
440 
441 	/* make sure that we can really access it */
442 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
443 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
444 	if (! (vendor_id == ivend && device_id == idev)) {
445 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
446 			      vendor_id, device_id);
447 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
448 		return;
449 	}
450 
451 	ptr32 = (uint32_t*)(cfgptr + 0x178);
452 	val = *ptr32;
453 
454 	if (val == 0xffffffff) {
455 		device_printf(sc->dev, "extended mapping failed\n");
456 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 		return;
458 	}
459 	*ptr32 = val | 0x40;
460 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 	if (mxge_verbose)
462 		device_printf(sc->dev,
463 			      "Enabled ECRC on upstream Nvidia bridge "
464 			      "at %d:%d:%d\n",
465 			      (int)bus, (int)slot, (int)func);
466 	return;
467 }
468 #else
469 static void
470 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 {
472 	device_printf(sc->dev,
473 		      "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
474 	return;
475 }
476 #endif
477 
478 
479 static int
480 mxge_dma_test(mxge_softc_t *sc, int test_type)
481 {
482 	mxge_cmd_t cmd;
483 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
484 	int status;
485 	uint32_t len;
486 	char *test = " ";
487 
488 
489 	/* Run a small DMA test.
490 	 * The magic multipliers to the length tell the firmware
491 	 * to do DMA read, write, or read+write tests.  The
492 	 * results are returned in cmd.data0.  The upper 16
493 	 * bits of the return is the number of transfers completed.
494 	 * The lower 16 bits is the time in 0.5us ticks that the
495 	 * transfers took to complete.
496 	 */
497 
498 	len = sc->tx_boundary;
499 
500 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
501 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
502 	cmd.data2 = len * 0x10000;
503 	status = mxge_send_cmd(sc, test_type, &cmd);
504 	if (status != 0) {
505 		test = "read";
506 		goto abort;
507 	}
508 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
509 		(cmd.data0 & 0xffff);
510 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
511 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
512 	cmd.data2 = len * 0x1;
513 	status = mxge_send_cmd(sc, test_type, &cmd);
514 	if (status != 0) {
515 		test = "write";
516 		goto abort;
517 	}
518 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
519 		(cmd.data0 & 0xffff);
520 
521 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
522 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
523 	cmd.data2 = len * 0x10001;
524 	status = mxge_send_cmd(sc, test_type, &cmd);
525 	if (status != 0) {
526 		test = "read/write";
527 		goto abort;
528 	}
529 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
530 		(cmd.data0 & 0xffff);
531 
532 abort:
533 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
534 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
535 			      test, status);
536 
537 	return status;
538 }
539 
540 /*
541  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
542  * when the PCI-E Completion packets are aligned on an 8-byte
543  * boundary.  Some PCI-E chip sets always align Completion packets; on
544  * the ones that do not, the alignment can be enforced by enabling
545  * ECRC generation (if supported).
546  *
547  * When PCI-E Completion packets are not aligned, it is actually more
548  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549  *
550  * If the driver can neither enable ECRC nor verify that it has
551  * already been enabled, then it must use a firmware image which works
552  * around unaligned completion packets (ethp_z8e.dat), and it should
553  * also ensure that it never gives the device a Read-DMA which is
554  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
555  * enabled, then the driver should use the aligned (eth_z8e.dat)
556  * firmware image, and set tx_boundary to 4KB.
557  */
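
/*
 * In short: with aligned completions (or ECRC enabled) use the
 * eth_z8e image and a 4KB tx_boundary; otherwise use the ethp_z8e
 * image and a 2KB tx_boundary.
 */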
558 
559 static int
560 mxge_firmware_probe(mxge_softc_t *sc)
561 {
562 	device_t dev = sc->dev;
563 	int reg, status;
564 	uint16_t pectl;
565 
566 	sc->tx_boundary = 4096;
567 	/*
568 	 * Verify the max read request size was set to 4KB
569 	 * before trying the test with 4KB.
570 	 */
571 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
572 		pectl = pci_read_config(dev, reg + 0x8, 2);
573 		if ((pectl & (5 << 12)) != (5 << 12)) {
574 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
575 				      pectl);
576 			sc->tx_boundary = 2048;
577 		}
578 	}
579 
580 	/*
581 	 * load the optimized firmware (which assumes aligned PCIe
582 	 * completions) in order to see if it works on this host.
583 	 */
584 	sc->fw_name = mxge_fw_aligned;
585 	status = mxge_load_firmware(sc, 1);
586 	if (status != 0) {
587 		return status;
588 	}
589 
590 	/*
591 	 * Enable ECRC if possible
592 	 */
593 	mxge_enable_nvidia_ecrc(sc);
594 
595 	/*
596 	 * Run a DMA test which watches for unaligned completions and
597 	 * aborts on the first one seen.
598 	 */
599 
600 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 	if (status == 0)
602 		return 0; /* keep the aligned firmware */
603 
604 	if (status != E2BIG)
605 		device_printf(dev, "DMA test failed: %d\n", status);
606 	if (status == ENOSYS)
607 		device_printf(dev, "Falling back to ethp! "
608 			      "Please install up to date fw\n");
609 	return status;
610 }
611 
612 static int
613 mxge_select_firmware(mxge_softc_t *sc)
614 {
615 	int aligned = 0;
616 
617 
618 	if (mxge_force_firmware != 0) {
619 		if (mxge_force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 union qualhack
655 {
656         const char *ro_char;
657         char *rw_char;
658 };
659 
660 static int
661 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
662 {
663 
664 
665 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
666 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
667 			      be32toh(hdr->mcp_type));
668 		return EIO;
669 	}
670 
671 	/* save firmware version for sysctl */
672 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 	if (mxge_verbose)
674 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 
676 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
677 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 
679 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
680 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
681 		device_printf(sc->dev, "Found firmware version %s\n",
682 			      sc->fw_version);
683 		device_printf(sc->dev, "Driver needs %d.%d\n",
684 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
685 		return EINVAL;
686 	}
687 	return 0;
688 
689 }
690 
691 static void *
692 z_alloc(void *nil, u_int items, u_int size)
693 {
694         void *ptr;
695 
696         ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
697         return ptr;
698 }
699 
700 static void
701 z_free(void *nil, void *ptr)
702 {
703         kfree(ptr, M_TEMP);
704 }
705 
706 
707 static int
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
709 {
710 	z_stream zs;
711 	char *inflate_buffer;
712 	const struct firmware *fw;
713 	const mcp_gen_header_t *hdr;
714 	unsigned hdr_offset;
715 	int status;
716 	unsigned int i;
717 	char dummy;
718 	size_t fw_len;
719 
720 	fw = firmware_get(sc->fw_name);
721 	if (fw == NULL) {
722 		device_printf(sc->dev, "Could not find firmware image %s\n",
723 			      sc->fw_name);
724 		return ENOENT;
725 	}
726 
727 
728 
729 	/* setup zlib and decompress f/w */
730 	bzero(&zs, sizeof (zs));
731 	zs.zalloc = z_alloc;
732 	zs.zfree = z_free;
733 	status = inflateInit(&zs);
734 	if (status != Z_OK) {
735 		status = EIO;
736 		goto abort_with_fw;
737 	}
738 
739 	/* the uncompressed size is stored as the firmware version,
740 	   which would otherwise go unused */
741 	fw_len = (size_t) fw->version;
742 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* don't return Z_OK (0) on failure */
		goto abort_with_zs;
	}
745 	zs.avail_in = fw->datasize;
746 	zs.next_in = __DECONST(char *, fw->data);
747 	zs.avail_out = fw_len;
748 	zs.next_out = inflate_buffer;
749 	status = inflate(&zs, Z_FINISH);
750 	if (status != Z_STREAM_END) {
751 		device_printf(sc->dev, "zlib %d\n", status);
752 		status = EIO;
753 		goto abort_with_buffer;
754 	}
755 
756 	/* check id */
757 	hdr_offset = htobe32(*(const uint32_t *)
758 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
759 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
760 		device_printf(sc->dev, "Bad firmware file");
761 		status = EIO;
762 		goto abort_with_buffer;
763 	}
764 	hdr = (const void*)(inflate_buffer + hdr_offset);
765 
766 	status = mxge_validate_firmware(sc, hdr);
767 	if (status != 0)
768 		goto abort_with_buffer;
769 
770 	/* Copy the inflated firmware to NIC SRAM. */
771 	for (i = 0; i < fw_len; i += 256) {
772 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
773 			      inflate_buffer + i,
774 			      min(256U, (unsigned)(fw_len - i)));
775 		wmb();
776 		dummy = *sc->sram;
777 		wmb();
778 	}
779 
780 	*limit = fw_len;
781 	status = 0;
782 abort_with_buffer:
783 	kfree(inflate_buffer, M_TEMP);
784 abort_with_zs:
785 	inflateEnd(&zs);
786 abort_with_fw:
787 	firmware_put(fw, FIRMWARE_UNLOAD);
788 	return status;
789 }
790 
791 /*
792  * Enable or disable periodic RDMAs from the host to make certain
793  * chipsets resend dropped PCIe messages
794  */
795 
796 static void
797 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
798 {
799 	char buf_bytes[72];
800 	volatile uint32_t *confirm;
801 	volatile char *submit;
802 	uint32_t *buf, dma_low, dma_high;
803 	int i;
804 
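	/* round the scratch buffer up to an 8-byte boundary for the PIO copy */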
805 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
806 
807 	/* clear confirmation addr */
808 	confirm = (volatile uint32_t *)sc->cmd;
809 	*confirm = 0;
810 	wmb();
811 
812 	/* send an rdma command to the PCIe engine, and wait for the
813 	   response in the confirmation address.  The firmware should
814 	   write a -1 there to indicate it is alive and well
815 	*/
816 
817 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
818 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
819 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
820 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
821 	buf[2] = htobe32(0xffffffff);		/* confirm data */
822 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
823 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
824 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
825 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
826 	buf[5] = htobe32(enable);			/* enable? */
827 
828 
829 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
830 
831 	mxge_pio_copy(submit, buf, 64);
832 	wmb();
833 	DELAY(1000);
834 	wmb();
835 	i = 0;
836 	while (*confirm != 0xffffffff && i < 20) {
837 		DELAY(1000);
838 		i++;
839 	}
840 	if (*confirm != 0xffffffff) {
841 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
842 			      (enable ? "enable" : "disable"), confirm,
843 			      *confirm);
844 	}
845 	return;
846 }
847 
848 static int
849 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
850 {
851 	mcp_cmd_t *buf;
852 	char buf_bytes[sizeof(*buf) + 8];
853 	volatile mcp_cmd_response_t *response = sc->cmd;
854 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
855 	uint32_t dma_low, dma_high;
856 	int err, sleep_total = 0;
857 
858 	/*
859 	 * We may be called during attach, before if_serializer is available.
860 	 * This is not a fast path, just check for NULL
861 	 */
862 
863 	if (sc->ifp->if_serializer)
864 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
865 
866 	/* ensure buf is aligned to 8 bytes */
867 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
868 
869 	buf->data0 = htobe32(data->data0);
870 	buf->data1 = htobe32(data->data1);
871 	buf->data2 = htobe32(data->data2);
872 	buf->cmd = htobe32(cmd);
873 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
874 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
875 
876 	buf->response_addr.low = htobe32(dma_low);
877 	buf->response_addr.high = htobe32(dma_high);
878 
879 
880 	response->result = 0xffffffff;
881 	wmb();
882 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
883 
884 	/* wait up to 20ms */
885 	err = EAGAIN;
886 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
887 		bus_dmamap_sync(sc->cmd_dma.dmat,
888 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
889 		wmb();
890 		switch (be32toh(response->result)) {
891 		case 0:
892 			data->data0 = be32toh(response->data);
893 			err = 0;
894 			break;
895 		case 0xffffffff:
896 			DELAY(1000);
897 			break;
898 		case MXGEFW_CMD_UNKNOWN:
899 			err = ENOSYS;
900 			break;
901 		case MXGEFW_CMD_ERROR_UNALIGNED:
902 			err = E2BIG;
903 			break;
904 		case MXGEFW_CMD_ERROR_BUSY:
905 			err = EBUSY;
906 			break;
907 		default:
908 			device_printf(sc->dev,
909 				      "mxge: command %d "
910 				      "failed, result = %d\n",
911 				      cmd, be32toh(response->result));
912 			err = ENXIO;
913 			break;
914 		}
915 		if (err != EAGAIN)
916 			break;
917 	}
918 	if (err == EAGAIN)
919 		device_printf(sc->dev, "mxge: command %d timed out"
920 			      "result = %d\n",
921 			      cmd, be32toh(response->result));
922 	return err;
923 }
924 
925 static int
926 mxge_adopt_running_firmware(mxge_softc_t *sc)
927 {
928 	struct mcp_gen_header *hdr;
929 	const size_t bytes = sizeof (struct mcp_gen_header);
930 	size_t hdr_offset;
931 	int status;
932 
933 	/* find running firmware header */
934 	hdr_offset = htobe32(*(volatile uint32_t *)
935 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
936 
937 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
938 		device_printf(sc->dev,
939 			      "Running firmware has bad header offset (%d)\n",
940 			      (int)hdr_offset);
941 		return EIO;
942 	}
943 
944 	/* copy header of running firmware from SRAM to host memory to
945 	 * validate firmware */
946 	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
947 	if (hdr == NULL) {
948 		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
949 		return ENOMEM;
950 	}
951 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
952 				rman_get_bushandle(sc->mem_res),
953 				hdr_offset, (char *)hdr, bytes);
954 	status = mxge_validate_firmware(sc, hdr);
955 	kfree(hdr, M_DEVBUF);
956 
957 	/*
958 	 * check to see if adopted firmware has bug where adopting
959 	 * it will cause broadcasts to be filtered unless the NIC
960 	 * is kept in ALLMULTI mode
961 	 */
962 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
963 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
964 		sc->adopted_rx_filter_bug = 1;
965 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
966 			      "working around rx filter bug\n",
967 			      sc->fw_ver_major, sc->fw_ver_minor,
968 			      sc->fw_ver_tiny);
969 	}
970 
971 	return status;
972 }
973 
974 
975 static int
976 mxge_load_firmware(mxge_softc_t *sc, int adopt)
977 {
978 	volatile uint32_t *confirm;
979 	volatile char *submit;
980 	char buf_bytes[72];
981 	uint32_t *buf, size, dma_low, dma_high;
982 	int status, i;
983 
984 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
985 
986 	size = sc->sram_size;
987 	status = mxge_load_firmware_helper(sc, &size);
988 	if (status) {
989 		if (!adopt)
990 			return status;
991 		/* Try to use the currently running firmware, if
992 		   it is new enough */
993 		status = mxge_adopt_running_firmware(sc);
994 		if (status) {
995 			device_printf(sc->dev,
996 				      "failed to adopt running firmware\n");
997 			return status;
998 		}
999 		device_printf(sc->dev,
1000 			      "Successfully adopted running firmware\n");
1001 		if (sc->tx_boundary == 4096) {
1002 			device_printf(sc->dev,
1003 				"Using firmware currently running on NIC"
1004 				 ".  For optimal\n");
1005 			device_printf(sc->dev,
1006 				 "performance consider loading optimized "
1007 				 "firmware\n");
1008 		}
1009 		sc->fw_name = mxge_fw_unaligned;
1010 		sc->tx_boundary = 2048;
1011 		return 0;
1012 	}
1013 	/* clear confirmation addr */
1014 	confirm = (volatile uint32_t *)sc->cmd;
1015 	*confirm = 0;
1016 	wmb();
1017 	/* send a reload command to the bootstrap MCP, and wait for the
1018 	   response in the confirmation address.  The firmware should
1019 	   write a -1 there to indicate it is alive and well
1020 	*/
1021 
1022 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1023 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1024 
1025 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1026 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1027 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1028 
1029 	/* FIX: All newest firmware should un-protect the bottom of
1030 	   the sram before handoff. However, the very first interfaces
1031 	   do not. Therefore the handoff copy must skip the first 8 bytes
1032 	*/
					/* where the code starts */
1034 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1035 	buf[4] = htobe32(size - 8); 	/* length of code */
1036 	buf[5] = htobe32(8);		/* where to copy to */
1037 	buf[6] = htobe32(0);		/* where to jump to */
1038 
1039 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1040 	mxge_pio_copy(submit, buf, 64);
1041 	wmb();
1042 	DELAY(1000);
1043 	wmb();
1044 	i = 0;
1045 	while (*confirm != 0xffffffff && i < 20) {
1046 		DELAY(1000*10);
1047 		i++;
1048 		bus_dmamap_sync(sc->cmd_dma.dmat,
1049 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1050 	}
1051 	if (*confirm != 0xffffffff) {
1052 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1053 			confirm, *confirm);
1054 
1055 		return ENXIO;
1056 	}
1057 	return 0;
1058 }
1059 
1060 static int
1061 mxge_update_mac_address(mxge_softc_t *sc)
1062 {
1063 	mxge_cmd_t cmd;
1064 	uint8_t *addr = sc->mac_addr;
1065 	int status;
1066 
1067 
1068 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1069 		     | (addr[2] << 8) | addr[3]);
1070 
1071 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1072 
1073 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1074 	return status;
1075 }
1076 
1077 static int
1078 mxge_change_pause(mxge_softc_t *sc, int pause)
1079 {
1080 	mxge_cmd_t cmd;
1081 	int status;
1082 
1083 	if (pause)
1084 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1085 				       &cmd);
1086 	else
1087 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1088 				       &cmd);
1089 
1090 	if (status) {
1091 		device_printf(sc->dev, "Failed to set flow control mode\n");
1092 		return ENXIO;
1093 	}
1094 	sc->pause = pause;
1095 	return 0;
1096 }
1097 
1098 static void
1099 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1100 {
1101 	mxge_cmd_t cmd;
1102 	int status;
1103 
	if (sc->ifp->if_serializer)
1105 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
1106 	if (mxge_always_promisc)
1107 		promisc = 1;
1108 
1109 	if (promisc)
1110 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1111 				       &cmd);
1112 	else
1113 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1114 				       &cmd);
1115 
1116 	if (status) {
1117 		device_printf(sc->dev, "Failed to set promisc mode\n");
1118 	}
1119 }
1120 
1121 static void
1122 mxge_set_multicast_list(mxge_softc_t *sc)
1123 {
1124 	mxge_cmd_t cmd;
1125 	struct ifmultiaddr *ifma;
1126 	struct ifnet *ifp = sc->ifp;
1127 	int err;
1128 
1129 	if (ifp->if_serializer)
1130 		ASSERT_SERIALIZED(ifp->if_serializer);
1131 
1132 	/* This firmware is known to not support multicast */
1133 	if (!sc->fw_multicast_support)
1134 		return;
1135 
1136 	/* Disable multicast filtering while we play with the lists*/
1137 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1138 	if (err != 0) {
1139 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1140 		       " error status: %d\n", err);
1141 		return;
1142 	}
1143 
1144 	if (sc->adopted_rx_filter_bug)
1145 		return;
1146 
1147 	if (ifp->if_flags & IFF_ALLMULTI)
1148 		/* request to disable multicast filtering, so quit here */
1149 		return;
1150 
1151 	/* Flush all the filters */
1152 
1153 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1154 	if (err != 0) {
1155 		device_printf(sc->dev,
1156 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1157 			      ", error status: %d\n", err);
1158 		return;
1159 	}
1160 
1161 	/* Walk the multicast list, and add each address */
1162 
1163 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1164 		if (ifma->ifma_addr->sa_family != AF_LINK)
1165 			continue;
1166 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1167 		      &cmd.data0, 4);
1168 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1169 		      &cmd.data1, 2);
1170 		cmd.data0 = htonl(cmd.data0);
1171 		cmd.data1 = htonl(cmd.data1);
1172 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1173 		if (err != 0) {
1174 			device_printf(sc->dev, "Failed "
1175 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1176 			       "%d\t", err);
1177 			/* abort, leaving multicast filtering off */
1178 			return;
1179 		}
1180 	}
1181 	/* Enable multicast filtering */
1182 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1183 	if (err != 0) {
1184 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1185 		       ", error status: %d\n", err);
1186 	}
1187 }
1188 
1189 static int
1190 mxge_max_mtu(mxge_softc_t *sc)
1191 {
1192 	mxge_cmd_t cmd;
1193 	int status;
1194 
1195 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1196 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1197 
	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
1200 	cmd.data0 = 0;
1201 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1202 			       &cmd);
1203 	if (status == 0)
1204 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1205 
1206 	/* otherwise, we're limited to MJUMPAGESIZE */
1207 	return MJUMPAGESIZE - MXGEFW_PAD;
1208 }
1209 
1210 static int
1211 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1212 {
1213 	struct mxge_slice_state *ss;
1214 	mxge_rx_done_t *rx_done;
1215 	volatile uint32_t *irq_claim;
1216 	mxge_cmd_t cmd;
1217 	int slice, status;
1218 
1219 	/* try to send a reset command to the card to see if it
1220 	   is alive */
1221 	memset(&cmd, 0, sizeof (cmd));
1222 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1223 	if (status != 0) {
1224 		device_printf(sc->dev, "failed reset\n");
1225 		return ENXIO;
1226 	}
1227 
1228 	mxge_dummy_rdma(sc, 1);
1229 
1230 
1231 	/* set the intrq size */
1232 	cmd.data0 = sc->rx_ring_size;
1233 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1234 
1235 	/*
1236 	 * Even though we already know how many slices are supported
1237 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1238 	 * has magic side effects, and must be called after a reset.
1239 	 * It must be called prior to calling any RSS related cmds,
1240 	 * including assigning an interrupt queue for anything but
1241 	 * slice 0.  It must also be called *after*
1242 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1243 	 * the firmware to compute offsets.
1244 	 */
1245 
1246 	if (sc->num_slices > 1) {
1247 		/* ask the maximum number of slices it supports */
1248 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1249 					   &cmd);
1250 		if (status != 0) {
1251 			device_printf(sc->dev,
1252 				      "failed to get number of slices\n");
1253 			return status;
1254 		}
1255 		/*
1256 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1257 		 * to setting up the interrupt queue DMA
1258 		 */
1259 		cmd.data0 = sc->num_slices;
1260 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1261 #ifdef IFNET_BUF_RING
1262 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1263 #endif
1264 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1265 					   &cmd);
1266 		if (status != 0) {
1267 			device_printf(sc->dev,
1268 				      "failed to set number of slices\n");
1269 			return status;
1270 		}
1271 	}
1272 
1273 
1274 	if (interrupts_setup) {
1275 		/* Now exchange information about interrupts  */
1276 		for (slice = 0; slice < sc->num_slices; slice++) {
1277 			rx_done = &sc->ss[slice].rx_done;
1278 			memset(rx_done->entry, 0, sc->rx_ring_size);
1279 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1280 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1281 			cmd.data2 = slice;
1282 			status |= mxge_send_cmd(sc,
1283 						MXGEFW_CMD_SET_INTRQ_DMA,
1284 						&cmd);
1285 		}
1286 	}
1287 
1288 	status |= mxge_send_cmd(sc,
1289 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1290 
1291 
1292 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1293 
1294 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1295 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1296 
1297 
1298 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1299 				&cmd);
1300 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1301 	if (status != 0) {
1302 		device_printf(sc->dev, "failed set interrupt parameters\n");
1303 		return status;
1304 	}
1305 
1306 
1307 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1308 
1309 
1310 	/* run a DMA benchmark */
1311 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1312 
1313 	for (slice = 0; slice < sc->num_slices; slice++) {
1314 		ss = &sc->ss[slice];
1315 
1316 		ss->irq_claim = irq_claim + (2 * slice);
1317 		/* reset mcp/driver shared state back to 0 */
1318 		ss->rx_done.idx = 0;
1319 		ss->rx_done.cnt = 0;
1320 		ss->tx.req = 0;
1321 		ss->tx.done = 0;
1322 		ss->tx.pkt_done = 0;
1323 		ss->tx.queue_active = 0;
1324 		ss->tx.activate = 0;
1325 		ss->tx.deactivate = 0;
1326 		ss->tx.wake = 0;
1327 		ss->tx.defrag = 0;
1328 		ss->tx.stall = 0;
1329 		ss->rx_big.cnt = 0;
1330 		ss->rx_small.cnt = 0;
1331 		ss->lro_bad_csum = 0;
1332 		ss->lro_queued = 0;
1333 		ss->lro_flushed = 0;
1334 		if (ss->fw_stats != NULL) {
1335 			ss->fw_stats->valid = 0;
1336 			ss->fw_stats->send_done_count = 0;
1337 		}
1338 	}
1339 	sc->rdma_tags_available = 15;
1340 	status = mxge_update_mac_address(sc);
1341 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1342 	mxge_change_pause(sc, sc->pause);
1343 	mxge_set_multicast_list(sc);
1344 	return status;
1345 }
1346 
1347 static int
1348 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1349 {
1350         mxge_softc_t *sc;
1351         unsigned int intr_coal_delay;
1352         int err;
1353 
1354         sc = arg1;
1355         intr_coal_delay = sc->intr_coal_delay;
1356         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1357         if (err != 0) {
1358                 return err;
1359         }
1360         if (intr_coal_delay == sc->intr_coal_delay)
1361                 return 0;
1362 
1363         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1364                 return EINVAL;
1365 
1366 	lwkt_serialize_enter(sc->ifp->if_serializer);
1367 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1368 	sc->intr_coal_delay = intr_coal_delay;
1369 
1370 	lwkt_serialize_exit(sc->ifp->if_serializer);
1371         return err;
1372 }
1373 
1374 static int
1375 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1376 {
1377         mxge_softc_t *sc;
1378         unsigned int enabled;
1379         int err;
1380 
1381         sc = arg1;
1382         enabled = sc->pause;
1383         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1384         if (err != 0) {
1385                 return err;
1386         }
1387         if (enabled == sc->pause)
1388                 return 0;
1389 
1390 	lwkt_serialize_enter(sc->ifp->if_serializer);
1391 	err = mxge_change_pause(sc, enabled);
1392 	lwkt_serialize_exit(sc->ifp->if_serializer);
1393         return err;
1394 }
1395 
1396 static int
1397 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1398 {
1399 	struct ifnet *ifp;
1400 	int err = 0;
1401 
1402 	ifp = sc->ifp;
1403 	if (lro_cnt == 0)
1404 		ifp->if_capenable &= ~IFCAP_LRO;
1405 	else
1406 		ifp->if_capenable |= IFCAP_LRO;
1407 	sc->lro_cnt = lro_cnt;
1408 	if (ifp->if_flags & IFF_RUNNING) {
1409 		mxge_close(sc);
1410 		err = mxge_open(sc);
1411 	}
1412 	return err;
1413 }
1414 
1415 static int
1416 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1417 {
1418 	mxge_softc_t *sc;
1419 	unsigned int lro_cnt;
1420 	int err;
1421 
1422 	sc = arg1;
1423 	lro_cnt = sc->lro_cnt;
1424 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1425 	if (err != 0)
1426 		return err;
1427 
1428 	if (lro_cnt == sc->lro_cnt)
1429 		return 0;
1430 
1431 	if (lro_cnt > 128)
1432 		return EINVAL;
1433 
1434 	lwkt_serialize_enter(sc->ifp->if_serializer);
1435 	err = mxge_change_lro_locked(sc, lro_cnt);
1436 	lwkt_serialize_exit(sc->ifp->if_serializer);
1437 	return err;
1438 }
1439 
1440 static int
1441 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1442 {
1443         int err;
1444 
1445         if (arg1 == NULL)
1446                 return EFAULT;
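        /*
         * Byte-swap the big-endian firmware counter and hand it to
         * sysctl_handle_int() as arg2 with arg1 == NULL, which simply
         * exports the value read-only.
         */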
1447         arg2 = be32toh(*(int *)arg1);
1448         arg1 = NULL;
1449         err = sysctl_handle_int(oidp, arg1, arg2, req);
1450 
1451         return err;
1452 }
1453 
1454 static void
1455 mxge_rem_sysctls(mxge_softc_t *sc)
1456 {
1457 	struct mxge_slice_state *ss;
1458 	int slice;
1459 
1460 	if (sc->slice_sysctl_tree == NULL)
1461 		return;
1462 
1463 	for (slice = 0; slice < sc->num_slices; slice++) {
1464 		ss = &sc->ss[slice];
1465 		if (ss == NULL || ss->sysctl_tree == NULL)
1466 			continue;
1467 		sysctl_ctx_free(&ss->sysctl_ctx);
1468 		ss->sysctl_tree = NULL;
1469 	}
1470 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1471 	sc->slice_sysctl_tree = NULL;
1472 	sysctl_ctx_free(&sc->sysctl_ctx);
1473 	sc->sysctl_tree = NULL;
1474 
1475 }
1476 
1477 static void
1478 mxge_add_sysctls(mxge_softc_t *sc)
1479 {
1480 	struct sysctl_ctx_list *ctx;
1481 	struct sysctl_oid_list *children;
1482 	mcp_irq_data_t *fw;
1483 	struct mxge_slice_state *ss;
1484 	int slice;
1485 	char slice_num[8];
1486 
1487 	ctx = &sc->sysctl_ctx;
1488 	sysctl_ctx_init(ctx);
1489 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1490 					  OID_AUTO,
1491 					  device_get_nameunit(sc->dev),
1492 					  CTLFLAG_RD, 0, "");
1493 	if (sc->sysctl_tree == NULL) {
1494 		device_printf(sc->dev, "can't add sysctl node\n");
1495 		return;
1496 	}
1497 
1498 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1499 	fw = sc->ss[0].fw_stats;
1500 
1501 	/* random information */
1502 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1503 		       "firmware_version",
1504 		       CTLFLAG_RD, &sc->fw_version,
1505 		       0, "firmware version");
1506 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1507 		       "serial_number",
1508 		       CTLFLAG_RD, &sc->serial_number_string,
1509 		       0, "serial number");
1510 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1511 		       "product_code",
1512 		       CTLFLAG_RD, &sc->product_code_string,
1513 		       0, "product_code");
1514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 		       "pcie_link_width",
1516 		       CTLFLAG_RD, &sc->link_width,
1517 		       0, "tx_boundary");
1518 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1519 		       "tx_boundary",
1520 		       CTLFLAG_RD, &sc->tx_boundary,
1521 		       0, "tx_boundary");
1522 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1523 		       "write_combine",
1524 		       CTLFLAG_RD, &sc->wc,
1525 		       0, "write combining PIO?");
1526 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1527 		       "read_dma_MBs",
1528 		       CTLFLAG_RD, &sc->read_dma,
1529 		       0, "DMA Read speed in MB/s");
1530 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531 		       "write_dma_MBs",
1532 		       CTLFLAG_RD, &sc->write_dma,
1533 		       0, "DMA Write speed in MB/s");
1534 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1535 		       "read_write_dma_MBs",
1536 		       CTLFLAG_RD, &sc->read_write_dma,
1537 		       0, "DMA concurrent Read/Write speed in MB/s");
1538 
1539 
1540 	/* performance related tunables */
1541 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542 			"intr_coal_delay",
1543 			CTLTYPE_INT|CTLFLAG_RW, sc,
1544 			0, mxge_change_intr_coal,
1545 			"I", "interrupt coalescing delay in usecs");
1546 
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"flow_control_enabled",
1549 			CTLTYPE_INT|CTLFLAG_RW, sc,
1550 			0, mxge_change_flow_control,
1551 			"I", "interrupt coalescing delay in usecs");
1552 
1553 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1554 		       "deassert_wait",
1555 		       CTLFLAG_RW, &mxge_deassert_wait,
1556 		       0, "Wait for IRQ line to go low in ihandler");
1557 
1558 	/* stats block from firmware is in network byte order.
1559 	   Need to swap it */
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 			"link_up",
1562 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1563 			0, mxge_handle_be32,
1564 			"I", "link up");
1565 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566 			"rdma_tags_available",
1567 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1568 			0, mxge_handle_be32,
1569 			"I", "rdma_tags_available");
1570 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571 			"dropped_bad_crc32",
1572 			CTLTYPE_INT|CTLFLAG_RD,
1573 			&fw->dropped_bad_crc32,
1574 			0, mxge_handle_be32,
1575 			"I", "dropped_bad_crc32");
1576 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577 			"dropped_bad_phy",
1578 			CTLTYPE_INT|CTLFLAG_RD,
1579 			&fw->dropped_bad_phy,
1580 			0, mxge_handle_be32,
1581 			"I", "dropped_bad_phy");
1582 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583 			"dropped_link_error_or_filtered",
1584 			CTLTYPE_INT|CTLFLAG_RD,
1585 			&fw->dropped_link_error_or_filtered,
1586 			0, mxge_handle_be32,
1587 			"I", "dropped_link_error_or_filtered");
1588 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589 			"dropped_link_overflow",
1590 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1591 			0, mxge_handle_be32,
1592 			"I", "dropped_link_overflow");
1593 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594 			"dropped_multicast_filtered",
1595 			CTLTYPE_INT|CTLFLAG_RD,
1596 			&fw->dropped_multicast_filtered,
1597 			0, mxge_handle_be32,
1598 			"I", "dropped_multicast_filtered");
1599 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600 			"dropped_no_big_buffer",
1601 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1602 			0, mxge_handle_be32,
1603 			"I", "dropped_no_big_buffer");
1604 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 			"dropped_no_small_buffer",
1606 			CTLTYPE_INT|CTLFLAG_RD,
1607 			&fw->dropped_no_small_buffer,
1608 			0, mxge_handle_be32,
1609 			"I", "dropped_no_small_buffer");
1610 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1611 			"dropped_overrun",
1612 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1613 			0, mxge_handle_be32,
1614 			"I", "dropped_overrun");
1615 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1616 			"dropped_pause",
1617 			CTLTYPE_INT|CTLFLAG_RD,
1618 			&fw->dropped_pause,
1619 			0, mxge_handle_be32,
1620 			"I", "dropped_pause");
1621 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1622 			"dropped_runt",
1623 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1624 			0, mxge_handle_be32,
1625 			"I", "dropped_runt");
1626 
1627 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1628 			"dropped_unicast_filtered",
1629 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1630 			0, mxge_handle_be32,
1631 			"I", "dropped_unicast_filtered");
1632 
1633 	/* verbose printing? */
1634 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 		       "verbose",
1636 		       CTLFLAG_RW, &mxge_verbose,
1637 		       0, "verbose printing");
1638 
1639 	/* lro */
1640 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641 			"lro_cnt",
1642 			CTLTYPE_INT|CTLFLAG_RW, sc,
1643 			0, mxge_change_lro,
1644 			"I", "number of lro merge queues");
1645 
1646 
1647 	/* add counters exported for debugging from all slices */
1648 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1649 	sc->slice_sysctl_tree =
1650 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1651 				"slice", CTLFLAG_RD, 0, "");
1652 
1653 	for (slice = 0; slice < sc->num_slices; slice++) {
1654 		ss = &sc->ss[slice];
1655 		sysctl_ctx_init(&ss->sysctl_ctx);
1656 		ctx = &ss->sysctl_ctx;
1657 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1658 		ksprintf(slice_num, "%d", slice);
1659 		ss->sysctl_tree =
1660 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1661 					CTLFLAG_RD, 0, "");
1662 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1663 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664 			       "rx_small_cnt",
1665 			       CTLFLAG_RD, &ss->rx_small.cnt,
1666 			       0, "rx_small_cnt");
1667 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668 			       "rx_big_cnt",
1669 			       CTLFLAG_RD, &ss->rx_big.cnt,
1670 			       0, "rx_small_cnt");
1671 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1673 			       0, "number of lro merge queues flushed");
1674 
1675 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1677 			       0, "number of frames appended to lro merge"
1678 			       "queues");
1679 
1680 #ifndef IFNET_BUF_RING
1681 		/* only transmit from slice 0 for now */
1682 		if (slice > 0)
1683 			continue;
1684 #endif
1685 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686 			       "tx_req",
1687 			       CTLFLAG_RD, &ss->tx.req,
1688 			       0, "tx_req");
1689 
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "tx_done",
1692 			       CTLFLAG_RD, &ss->tx.done,
1693 			       0, "tx_done");
1694 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1695 			       "tx_pkt_done",
1696 			       CTLFLAG_RD, &ss->tx.pkt_done,
1697 			       0, "tx_done");
1698 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699 			       "tx_stall",
1700 			       CTLFLAG_RD, &ss->tx.stall,
1701 			       0, "tx_stall");
1702 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1703 			       "tx_wake",
1704 			       CTLFLAG_RD, &ss->tx.wake,
1705 			       0, "tx_wake");
1706 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1707 			       "tx_defrag",
1708 			       CTLFLAG_RD, &ss->tx.defrag,
1709 			       0, "tx_defrag");
1710 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1711 			       "tx_queue_active",
1712 			       CTLFLAG_RD, &ss->tx.queue_active,
1713 			       0, "tx_queue_active");
1714 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1715 			       "tx_activate",
1716 			       CTLFLAG_RD, &ss->tx.activate,
1717 			       0, "tx_activate");
1718 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1719 			       "tx_deactivate",
1720 			       CTLFLAG_RD, &ss->tx.deactivate,
1721 			       0, "tx_deactivate");
1722 	}
1723 }
1724 
1725 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1726    backwards one at a time and handle ring wraps */
1727 
1728 static inline void
1729 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1730 			    mcp_kreq_ether_send_t *src, int cnt)
1731 {
1732         int idx, starting_slot;
1733         starting_slot = tx->req;
1734         while (cnt > 1) {
1735                 cnt--;
1736                 idx = (starting_slot + cnt) & tx->mask;
1737                 mxge_pio_copy(&tx->lanai[idx],
1738 			      &src[cnt], sizeof(*src));
1739                 wmb();
1740         }
1741 }
1742 
1743 /*
1744  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1745  * at most 32 bytes at a time, so as to avoid involving the software
1746  * pio handler in the nic.   We re-write the first segment's flags
1747  * to mark them valid only after writing the entire chain
1748  */
1749 
1750 static inline void
1751 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1752                   int cnt)
1753 {
1754         int idx, i;
1755         uint32_t *src_ints;
1756 	volatile uint32_t *dst_ints;
1757         mcp_kreq_ether_send_t *srcp;
1758 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1759 	uint8_t last_flags;
1760 
1761         idx = tx->req & tx->mask;
1762 
1763 	last_flags = src->flags;
1764 	src->flags = 0;
1765         wmb();
1766         dst = dstp = &tx->lanai[idx];
1767         srcp = src;
1768 
1769         if ((idx + cnt) < tx->mask) {
1770                 for (i = 0; i < (cnt - 1); i += 2) {
1771                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1772                         wmb(); /* force write every 32 bytes */
1773                         srcp += 2;
1774                         dstp += 2;
1775                 }
1776         } else {
1777                 /* submit all but the first request, and ensure
1778                    that it is submitted below */
1779                 mxge_submit_req_backwards(tx, src, cnt);
1780                 i = 0;
1781         }
1782         if (i < cnt) {
1783                 /* submit the first request */
1784                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1785                 wmb(); /* barrier before setting valid flag */
1786         }
1787 
1788         /* re-write the last 32-bits with the valid flags */
1789         src->flags = last_flags;
1790         src_ints = (uint32_t *)src;
1791         src_ints+=3;
1792         dst_ints = (volatile uint32_t *)dst;
1793         dst_ints+=3;
1794         *dst_ints =  *src_ints;
1795         tx->req += cnt;
1796         wmb();
1797 }
1798 
1799 #if IFCAP_TSO4
1800 
1801 static void
1802 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1803 	       int busdma_seg_cnt, int ip_off)
1804 {
1805 	mxge_tx_ring_t *tx;
1806 	mcp_kreq_ether_send_t *req;
1807 	bus_dma_segment_t *seg;
1808 	struct ip *ip;
1809 	struct tcphdr *tcp;
1810 	uint32_t low, high_swapped;
1811 	int len, seglen, cum_len, cum_len_next;
1812 	int next_is_first, chop, cnt, rdma_count, small;
1813 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1814 	uint8_t flags, flags_next;
1815 	static int once;
1816 
1817 	mss = m->m_pkthdr.tso_segsz;
1818 
1819 	/* negative cum_len signifies to the
1820 	 * send loop that we are still in the
1821 	 * header portion of the TSO packet.
1822 	 */
1823 
1824 	/* ensure we have the ethernet, IP and TCP
1825 	   header together in the first mbuf, copy
1826 	   it to a scratch buffer if not */
1827 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1828 		m_copydata(m, 0, ip_off + sizeof (*ip),
1829 			   ss->scratch);
1830 		ip = (struct ip *)(ss->scratch + ip_off);
1831 	} else {
1832 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1833 	}
1834 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1835 			    + sizeof (*tcp))) {
1836 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1837 			   + sizeof (*tcp),  ss->scratch);
		/* the headers were just copied into the scratch buffer */
		ip = (struct ip *)(ss->scratch + ip_off);
1839 	}
1840 
1841 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1842 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1843 
1844 	/* TSO implies checksum offload on this hardware */
1845 	cksum_offset = ip_off + (ip->ip_hl << 2);
1846 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1847 
1848 
1849 	/* for TSO, pseudo_hdr_offset holds mss.
1850 	 * The firmware figures out where to put
1851 	 * the checksum by parsing the header. */
1852 	pseudo_hdr_offset = htobe16(mss);
1853 
1854 	tx = &ss->tx;
1855 	req = tx->req_list;
1856 	seg = tx->seg_list;
1857 	cnt = 0;
1858 	rdma_count = 0;
1859 	/* "rdma_count" is the number of RDMAs belonging to the
1860 	 * current packet BEFORE the current send request. For
1861 	 * non-TSO packets, this is equal to "count".
1862 	 * For TSO packets, rdma_count needs to be reset
1863 	 * to 0 after a segment cut.
1864 	 *
1865 	 * The rdma_count field of the send request is
1866 	 * the number of RDMAs of the packet starting at
1867 	 * that request. For TSO send requests with one or more cuts
1868 	 * in the middle, this is the number of RDMAs starting
1869 	 * after the last cut in the request. All previous
1870 	 * segments before the last cut implicitly have 1 RDMA.
1871 	 *
1872 	 * Since the number of RDMAs is not known beforehand,
1873 	 * it must be filled-in retroactively - after each
1874 	 * segmentation cut or at the end of the entire packet.
1875 	 */
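	/*
	 * Worked example (illustrative only): with mss = 1448 and
	 * cum_len = 0, a single 2000-byte busdma segment yields one
	 * send descriptor covering all 2000 bytes with
	 * MXGEFW_FLAGS_TSO_CHOP set, because it crosses an mss
	 * boundary; the firmware performs the actual cut.  cum_len_next
	 * becomes 2000 % 1448 = 552, i.e. the descriptor ends 552 bytes
	 * into the following TSO frame, and rdma_count restarts at the
	 * cut as described above.
	 */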
1876 
1877 	while (busdma_seg_cnt) {
1878 		/* Break the busdma segment up into pieces*/
1879 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1880 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1881 		len = seg->ds_len;
1882 
1883 		while (len) {
1884 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1885 			seglen = len;
1886 			cum_len_next = cum_len + seglen;
1887 			(req-rdma_count)->rdma_count = rdma_count + 1;
1888 			if (__predict_true(cum_len >= 0)) {
1889 				/* payload */
1890 				chop = (cum_len_next > mss);
1891 				cum_len_next = cum_len_next % mss;
1892 				next_is_first = (cum_len_next == 0);
1893 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1894 				flags_next |= next_is_first *
1895 					MXGEFW_FLAGS_FIRST;
1896 				rdma_count |= -(chop | next_is_first);
1897 				rdma_count += chop & !next_is_first;
1898 			} else if (cum_len_next >= 0) {
1899 				/* header ends */
1900 				rdma_count = -1;
1901 				cum_len_next = 0;
1902 				seglen = -cum_len;
1903 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1904 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1905 					MXGEFW_FLAGS_FIRST |
1906 					(small * MXGEFW_FLAGS_SMALL);
1907 			}
1908 
1909 			req->addr_high = high_swapped;
1910 			req->addr_low = htobe32(low);
1911 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1912 			req->pad = 0;
1913 			req->rdma_count = 1;
1914 			req->length = htobe16(seglen);
1915 			req->cksum_offset = cksum_offset;
1916 			req->flags = flags | ((cum_len & 1) *
1917 					      MXGEFW_FLAGS_ALIGN_ODD);
1918 			low += seglen;
1919 			len -= seglen;
1920 			cum_len = cum_len_next;
1921 			flags = flags_next;
1922 			req++;
1923 			cnt++;
1924 			rdma_count++;
1925 			if (__predict_false(cksum_offset > seglen))
1926 				cksum_offset -= seglen;
1927 			else
1928 				cksum_offset = 0;
1929 			if (__predict_false(cnt > tx->max_desc))
1930 				goto drop;
1931 		}
1932 		busdma_seg_cnt--;
1933 		seg++;
1934 	}
1935 	(req-rdma_count)->rdma_count = rdma_count;
1936 
1937 	do {
1938 		req--;
1939 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1940 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1941 
1942 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1943 	mxge_submit_req(tx, tx->req_list, cnt);
1944 #ifdef IFNET_BUF_RING
1945 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1946 		/* tell the NIC to start polling this slice */
1947 		*tx->send_go = 1;
1948 		tx->queue_active = 1;
1949 		tx->activate++;
1950 		wmb();
1951 	}
1952 #endif
1953 	return;
1954 
1955 drop:
1956 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1957 	m_freem(m);
1958 	ss->oerrors++;
1959 	if (!once) {
1960 		kprintf("tx->max_desc exceeded via TSO!\n");
1961 		kprintf("mss = %d, seg offset = %ld bytes, max_desc = %d!\n", mss,
1962 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1963 		once = 1;
1964 	}
1965 	return;
1966 
1967 }
1968 
1969 #endif /* IFCAP_TSO4 */
1970 
1971 #ifdef MXGE_NEW_VLAN_API
1972 /*
1973  * We reproduce the software vlan tag insertion from
1974  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1975  * vlan tag insertion. We need to advertise this in order to have the
1976  * vlan interface respect our csum offload flags.
1977  */
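/*
 * Sketch of the transformation (illustrative): a frame that starts as
 *
 *	dst[6] | src[6] | type[2] | payload
 *
 * has EVL_ENCAPLEN (4) bytes prepended, the two MAC addresses are slid
 * forward into that space, and the gap is filled with the 802.1Q
 * encapsulation:
 *
 *	dst[6] | src[6] | 0x8100[2] | tag[2] | type[2] | payload
 */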
1978 static struct mbuf *
1979 mxge_vlan_tag_insert(struct mbuf *m)
1980 {
1981 	struct ether_vlan_header *evl;
1982 
1983 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1984 	if (__predict_false(m == NULL))
1985 		return NULL;
1986 	if (m->m_len < sizeof(*evl)) {
1987 		m = m_pullup(m, sizeof(*evl));
1988 		if (__predict_false(m == NULL))
1989 			return NULL;
1990 	}
1991 	/*
1992 	 * Transform the Ethernet header into an Ethernet header
1993 	 * with 802.1Q encapsulation.
1994 	 */
1995 	evl = mtod(m, struct ether_vlan_header *);
1996 	bcopy((char *)evl + EVL_ENCAPLEN,
1997 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1998 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1999 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
2000 	m->m_flags &= ~M_VLANTAG;
2001 	return m;
2002 }
2003 #endif /* MXGE_NEW_VLAN_API */
2004 
2005 static void
2006 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2007 {
2008 	mxge_softc_t *sc;
2009 	mcp_kreq_ether_send_t *req;
2010 	bus_dma_segment_t *seg;
2011 	struct mbuf *m_tmp;
2012 	struct ifnet *ifp;
2013 	mxge_tx_ring_t *tx;
2014 	struct ip *ip;
2015 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2016 	uint16_t pseudo_hdr_offset;
2017 	uint8_t flags, cksum_offset;
2018 
2019 
2020 	sc = ss->sc;
2021 	ifp = sc->ifp;
2022 	tx = &ss->tx;
2023 
2024 	ip_off = sizeof (struct ether_header);
2025 #ifdef MXGE_NEW_VLAN_API
2026 	if (m->m_flags & M_VLANTAG) {
2027 		m = mxge_vlan_tag_insert(m);
2028 		if (__predict_false(m == NULL))
2029 			goto drop;
2030 		ip_off += EVL_ENCAPLEN;
2031 	}
2032 #endif
2033 	/* (try to) map the frame for DMA */
2034 	idx = tx->req & tx->mask;
2035 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2036 					   m, tx->seg_list, 1, &cnt,
2037 					   BUS_DMA_NOWAIT);
2038 	if (__predict_false(err == EFBIG)) {
2039 		/* Too many segments in the chain.  Try
2040 		   to defrag */
2041 		m_tmp = m_defrag(m, MB_DONTWAIT);
2042 		if (m_tmp == NULL) {
2043 			goto drop;
2044 		}
2045 		ss->tx.defrag++;
2046 		m = m_tmp;
2047 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2048 					      tx->info[idx].map,
2049 					      m, tx->seg_list, 1, &cnt,
2050 					      BUS_DMA_NOWAIT);
2051 	}
2052 	if (__predict_false(err != 0)) {
2053 		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d, "
2054 			      "packet len = %d\n", err, m->m_pkthdr.len);
2055 		goto drop;
2056 	}
2057 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2058 			BUS_DMASYNC_PREWRITE);
2059 	tx->info[idx].m = m;
2060 
2061 #if IFCAP_TSO4
2062 	/* TSO is different enough, we handle it in another routine */
2063 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2064 		mxge_encap_tso(ss, m, cnt, ip_off);
2065 		return;
2066 	}
2067 #endif
2068 
2069 	req = tx->req_list;
2070 	cksum_offset = 0;
2071 	pseudo_hdr_offset = 0;
2072 	flags = MXGEFW_FLAGS_NO_TSO;
2073 
2074 	/* checksum offloading? */
2075 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2076 		/* ensure ip header is in first mbuf, copy
2077 		   it to a scratch buffer if not */
2078 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2079 			m_copydata(m, 0, ip_off + sizeof (*ip),
2080 				   ss->scratch);
2081 			ip = (struct ip *)(ss->scratch + ip_off);
2082 		} else {
2083 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2084 		}
2085 		cksum_offset = ip_off + (ip->ip_hl << 2);
2086 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2087 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2088 		req->cksum_offset = cksum_offset;
2089 		flags |= MXGEFW_FLAGS_CKSUM;
2090 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2091 	} else {
2092 		odd_flag = 0;
2093 	}
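	/*
	 * Added note: cksum_offset is where the firmware begins the
	 * 1s-complement sum (the start of the L4 header), while
	 * pseudo_hdr_offset = cksum_offset + csum_data locates the
	 * checksum field itself; the stack pre-seeds that field with
	 * the pseudo-header sum and the firmware deposits the final
	 * checksum there.
	 */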
2094 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2095 		flags |= MXGEFW_FLAGS_SMALL;
2096 
2097 	/* convert segments into a request list */
2098 	cum_len = 0;
2099 	seg = tx->seg_list;
2100 	req->flags = MXGEFW_FLAGS_FIRST;
2101 	for (i = 0; i < cnt; i++) {
2102 		req->addr_low =
2103 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2104 		req->addr_high =
2105 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2106 		req->length = htobe16(seg->ds_len);
2107 		req->cksum_offset = cksum_offset;
2108 		if (cksum_offset > seg->ds_len)
2109 			cksum_offset -= seg->ds_len;
2110 		else
2111 			cksum_offset = 0;
2112 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2113 		req->pad = 0; /* complete solid 16-byte block */
2114 		req->rdma_count = 1;
2115 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2116 		cum_len += seg->ds_len;
2117 		seg++;
2118 		req++;
2119 		req->flags = 0;
2120 	}
2121 	req--;
2122 	/* pad runts to 60 bytes */
2123 	if (cum_len < 60) {
2124 		req++;
2125 		req->addr_low =
2126 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2127 		req->addr_high =
2128 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2129 		req->length = htobe16(60 - cum_len);
2130 		req->cksum_offset = 0;
2131 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2132 		req->pad = 0; /* complete solid 16-byte block */
2133 		req->rdma_count = 1;
2134 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2135 		cnt++;
2136 	}
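	/*
	 * Why 60 bytes: the minimum Ethernet frame is 64 bytes
	 * including the 4-byte FCS, which the NIC appends itself, so
	 * frames are padded out to 64 - 4 = 60 bytes from a DMA-able
	 * block of zeros.
	 */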
2137 
2138 	tx->req_list[0].rdma_count = cnt;
2139 #if 0
2140 	/* print what the firmware will see */
2141 	for (i = 0; i < cnt; i++) {
2142 		kprintf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2143 		    "cso:%d, flags:0x%x, rdma:%d\n",
2144 		    i, (int)ntohl(tx->req_list[i].addr_high),
2145 		    (int)ntohl(tx->req_list[i].addr_low),
2146 		    (int)ntohs(tx->req_list[i].length),
2147 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2148 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2149 		    tx->req_list[i].rdma_count);
2150 	}
2151 	kprintf("--------------\n");
2152 #endif
2153 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2154 	mxge_submit_req(tx, tx->req_list, cnt);
2155 #ifdef IFNET_BUF_RING
2156 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2157 		/* tell the NIC to start polling this slice */
2158 		*tx->send_go = 1;
2159 		tx->queue_active = 1;
2160 		tx->activate++;
2161 		wmb();
2162 	}
2163 #endif
2164 	return;
2165 
2166 drop:
2167 	m_freem(m);
2168 	ss->oerrors++;
2169 	return;
2170 }
2171 
2172 #ifdef IFNET_BUF_RING
2173 static void
2174 mxge_qflush(struct ifnet *ifp)
2175 {
2176 	mxge_softc_t *sc = ifp->if_softc;
2177 	mxge_tx_ring_t *tx;
2178 	struct mbuf *m;
2179 	int slice;
2180 
2181 	for (slice = 0; slice < sc->num_slices; slice++) {
2182 		tx = &sc->ss[slice].tx;
2183 		lwkt_serialize_enter(sc->ifp->if_serializer);
2184 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2185 			m_freem(m);
2186 		lwkt_serialize_exit(sc->ifp->if_serializer);
2187 	}
2188 	if_qflush(ifp);
2189 }
2190 
2191 static inline void
2192 mxge_start_locked(struct mxge_slice_state *ss)
2193 {
2194 	mxge_softc_t *sc;
2195 	struct mbuf *m;
2196 	struct ifnet *ifp;
2197 	mxge_tx_ring_t *tx;
2198 
2199 	sc = ss->sc;
2200 	ifp = sc->ifp;
2201 	tx = &ss->tx;
2202 
2203 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2204 		m = drbr_dequeue(ifp, tx->br);
2205 		if (m == NULL) {
2206 			return;
2207 		}
2208 		/* let BPF see it */
2209 		BPF_MTAP(ifp, m);
2210 
2211 		/* give it to the nic */
2212 		mxge_encap(ss, m);
2213 	}
2214 	/* ran out of transmit slots */
2215 	if (((ss->if_flags & IFF_OACTIVE) == 0)
2216 	    && (!drbr_empty(ifp, tx->br))) {
2217 		ss->if_flags |= IFF_OACTIVE;
2218 		tx->stall++;
2219 	}
2220 }
2221 
2222 static int
2223 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2224 {
2225 	mxge_softc_t *sc;
2226 	struct ifnet *ifp;
2227 	mxge_tx_ring_t *tx;
2228 	int err;
2229 
2230 	sc = ss->sc;
2231 	ifp = sc->ifp;
2232 	tx = &ss->tx;
2233 
2234 	if ((ss->if_flags & (IFF_RUNNING|IFF_OACTIVE)) !=
2235 	    IFF_RUNNING) {
2236 		err = drbr_enqueue(ifp, tx->br, m);
2237 		return (err);
2238 	}
2239 
2240 	if (drbr_empty(ifp, tx->br) &&
2241 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2242 		/* let BPF see it */
2243 		BPF_MTAP(ifp, m);
2244 		/* give it to the nic */
2245 		mxge_encap(ss, m);
2246 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2247 		return (err);
2248 	}
2249 	if (!drbr_empty(ifp, tx->br))
2250 		mxge_start_locked(ss);
2251 	return (0);
2252 }
2253 
2254 static int
2255 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2256 {
2257 	mxge_softc_t *sc = ifp->if_softc;
2258 	struct mxge_slice_state *ss;
2259 	mxge_tx_ring_t *tx;
2260 	int err = 0;
2261 	int slice;
2262 
2263 #if 0
2264 	slice = m->m_pkthdr.flowid;
2265 #else
	slice = 0;	/* flowid unavailable; don't use slice uninitialized */
#endif
2266 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
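	/*
	 * For a power-of-2 n, (x & (n - 1)) == (x % n), so the mask
	 * above maps any flowid onto a valid slice without a divide.
	 */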
2267 
2268 	ss = &sc->ss[slice];
2269 	tx = &ss->tx;
2270 
2271 	if (lwkt_serialize_try(ifp->if_serializer)) {
2272 		err = mxge_transmit_locked(ss, m);
2273 		lwkt_serialize_exit(ifp->if_serializer);
2274 	} else {
2275 		err = drbr_enqueue(ifp, tx->br, m);
2276 	}
2277 
2278 	return (err);
2279 }
2280 
2281 #else
2282 
2283 static inline void
2284 mxge_start_locked(struct mxge_slice_state *ss)
2285 {
2286 	mxge_softc_t *sc;
2287 	struct mbuf *m;
2288 	struct ifnet *ifp;
2289 	mxge_tx_ring_t *tx;
2290 
2291 	sc = ss->sc;
2292 	ifp = sc->ifp;
2293 	tx = &ss->tx;
2294 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2295 		m = ifq_dequeue(&ifp->if_snd, NULL);
2296 		if (m == NULL) {
2297 			return;
2298 		}
2299 		/* let BPF see it */
2300 		BPF_MTAP(ifp, m);
2301 
2302 		/* give it to the nic */
2303 		mxge_encap(ss, m);
2304 	}
2305 	/* ran out of transmit slots */
2306 	if ((sc->ifp->if_flags & IFF_OACTIVE) == 0) {
2307 		sc->ifp->if_flags |= IFF_OACTIVE;
2308 		tx->stall++;
2309 	}
2310 }
2311 #endif
2312 static void
2313 mxge_start(struct ifnet *ifp)
2314 {
2315 	mxge_softc_t *sc = ifp->if_softc;
2316 	struct mxge_slice_state *ss;
2317 
2318 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2319 	/* only use the first slice for now */
2320 	ss = &sc->ss[0];
2321 	mxge_start_locked(ss);
2322 }
2323 
2324 /*
2325  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2326  * at most 32 bytes at a time, so as to avoid involving the software
2327  * PIO handler in the NIC.  We re-write the first segment's low
2328  * DMA address to mark it valid only after the entire chunk has
2329  * been written in a burst.
2330  */
2331 static inline void
2332 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2333 		mcp_kreq_ether_recv_t *src)
2334 {
2335 	uint32_t low;
2336 
2337 	low = src->addr_low;
2338 	src->addr_low = 0xffffffff;
2339 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2340 	wmb();
2341 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2342 	wmb();
2343 	src->addr_low = low;
2344 	dst->addr_low = low;
2345 	wmb();
2346 }
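/*
 * Sizing note (illustrative, assuming an 8-byte mcp_kreq_ether_recv_t):
 * eight receive descriptors are 64 bytes, so the two 32-byte
 * mxge_pio_copy() bursts above refill the NIC in one batch.  This is
 * why the callers below submit only when (idx & 7) == 7, i.e. once
 * every eighth buffer.
 */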
2347 
2348 static int
2349 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2350 {
2351 	bus_dma_segment_t seg;
2352 	struct mbuf *m;
2353 	mxge_rx_ring_t *rx = &ss->rx_small;
2354 	int cnt, err;
2355 
2356 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2357 	if (m == NULL) {
2358 		rx->alloc_fail++;
2359 		err = ENOBUFS;
2360 		goto done;
2361 	}
2362 	m->m_len = m->m_pkthdr.len = MHLEN;
2363 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2364 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2365 	if (err != 0) {
2366 		kprintf("can't dmamap small (%d)\n", err);
2367 		m_free(m);
2368 		goto done;
2369 	}
2370 	rx->info[idx].m = m;
2371 	rx->shadow[idx].addr_low =
2372 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2373 	rx->shadow[idx].addr_high =
2374 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2375 
2376 done:
2377 	if ((idx & 7) == 7)
2378 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2379 	return err;
2380 }
2381 
2382 
2383 static int
2384 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2385 {
2386 	bus_dma_segment_t seg[3];
2387 	struct mbuf *m;
2388 	mxge_rx_ring_t *rx = &ss->rx_big;
2389 	int cnt, err, i;
2390 
2391 	if (rx->cl_size == MCLBYTES)
2392 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2393 	else {
2394 #if 0
2395 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2396 #else
2397 		/*
2398 		 * XXX: allocate normal sized buffers for big buffers.
2399 		 * We should be fine as long as we don't get any jumbo frames
2400 		 */
2401 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2402 #endif
2403 	}
2404 	if (m == NULL) {
2405 		rx->alloc_fail++;
2406 		err = ENOBUFS;
2407 		goto done;
2408 	}
2410 	m->m_len = m->m_pkthdr.len = rx->mlen;
2411 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2412 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2413 	if (err != 0) {
2414 		kprintf("can't dmamap big (%d)\n", err);
2415 		m_free(m);
2416 		goto done;
2417 	}
2418 	rx->info[idx].m = m;
2419 	rx->shadow[idx].addr_low =
2420 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2421 	rx->shadow[idx].addr_high =
2422 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2423 
2424 #if MXGE_VIRT_JUMBOS
2425 	for (i = 1; i < cnt; i++) {
2426 		rx->shadow[idx + i].addr_low =
2427 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2428 		rx->shadow[idx + i].addr_high =
2429 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2430 	}
2431 #endif
2432 
2433 done:
2434 	for (i = 0; i < rx->nbufs; i++) {
2435 		if ((idx & 7) == 7) {
2436 			mxge_submit_8rx(&rx->lanai[idx - 7],
2437 					&rx->shadow[idx - 7]);
2438 		}
2439 		idx++;
2440 	}
2441 	return err;
2442 }
2443 
2444 /*
2445  *  Myri10GE hardware checksums are not valid if the sender
2446  *  padded the frame with non-zero padding.  This is because
2447  *  the firmware just does a simple 16-bit 1s complement
2448  *  checksum across the entire frame, excluding the first 14
2449  *  bytes.  It is best to simply check the checksum and
2450  *  tell the stack about it only if the checksum is good.
2451  */
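/*
 * Return convention (added note): 0 means the hardware sum verified as
 * a valid TCP/UDP checksum; any non-zero value (including the early
 * "return 1" cases for non-IPv4 or non-TCP/UDP frames) means the
 * checksum could not be validated here and must not be trusted.
 */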
2452 
2453 static inline uint16_t
2454 mxge_rx_csum(struct mbuf *m, int csum)
2455 {
2456 	struct ether_header *eh;
2457 	struct ip *ip;
2458 	uint16_t c;
2459 
2460 	eh = mtod(m, struct ether_header *);
2461 
2462 	/* only deal with IPv4 TCP & UDP for now */
2463 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2464 		return 1;
2465 	ip = (struct ip *)(eh + 1);
2466 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2467 			    ip->ip_p != IPPROTO_UDP))
2468 		return 1;
2469 #ifdef INET
2470 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2471 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2472 			    (ip->ip_hl << 2) + ip->ip_p));
2473 #else
2474 	c = 1;
2475 #endif
2476 	c ^= 0xffff;
2477 	return (c);
2478 }
2479 
2480 static void
2481 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2482 {
2483 	struct ether_vlan_header *evl;
2484 	struct ether_header *eh;
2485 	uint32_t partial;
2486 
2487 	evl = mtod(m, struct ether_vlan_header *);
2488 	eh = mtod(m, struct ether_header *);
2489 
2490 	/*
2491 	 * fix checksum by subtracting EVL_ENCAPLEN bytes
2492 	 * after what the firmware thought was the end of the ethernet
2493 	 * header.
2494 	 */
2495 
2496 	/* put checksum into host byte order */
2497 	*csum = ntohs(*csum);
2498 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2499 	(*csum) += ~partial;
2500 	(*csum) +=  ((*csum) < ~partial);
2501 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2502 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
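	/*
	 * The three steps above are 1s-complement subtraction: adding
	 * ~partial removes the 4 VLAN bytes from the running sum, the
	 * ((*csum) < ~partial) term re-adds the end-around carry, and
	 * the two folds reduce the 32-bit accumulator back to 16 bits.
	 */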
2503 
2504 	/* restore checksum to network byte order;
2505 	   later consumers expect this */
2506 	*csum = htons(*csum);
2507 
2508 	/* save the tag */
2509 #ifdef MXGE_NEW_VLAN_API
2510 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2511 #else
2512 	{
2513 		struct m_tag *mtag;
2514 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2515 				   MB_DONTWAIT);
2516 		if (mtag == NULL)
2517 			return;
2518 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2519 		m_tag_prepend(m, mtag);
2520 	}
2521 
2522 #endif
2523 	m->m_flags |= M_VLANTAG;
2524 
2525 	/*
2526 	 * Remove the 802.1q header by copying the Ethernet
2527 	 * addresses over it and adjusting the beginning of
2528 	 * the data in the mbuf.  The encapsulated Ethernet
2529 	 * type field is already in place.
2530 	 */
2531 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2532 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2533 	m_adj(m, EVL_ENCAPLEN);
2534 }
2535 
2536 
2537 static inline void
2538 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2539 {
2540 	mxge_softc_t *sc;
2541 	struct ifnet *ifp;
2542 	struct mbuf *m;
2543 	struct ether_header *eh;
2544 	mxge_rx_ring_t *rx;
2545 	bus_dmamap_t old_map;
2546 	int idx;
2547 	uint16_t tcpudp_csum;
2548 
2549 	sc = ss->sc;
2550 	ifp = sc->ifp;
2551 	rx = &ss->rx_big;
2552 	idx = rx->cnt & rx->mask;
2553 	rx->cnt += rx->nbufs;
2554 	/* save a pointer to the received mbuf */
2555 	m = rx->info[idx].m;
2556 	/* try to replace the received mbuf */
2557 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2558 		/* drop the frame -- the old mbuf is re-cycled */
2559 		ifp->if_ierrors++;
2560 		return;
2561 	}
2562 
2563 	/* unmap the received buffer */
2564 	old_map = rx->info[idx].map;
2565 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2566 	bus_dmamap_unload(rx->dmat, old_map);
2567 
2568 	/* swap the bus_dmamap_t's */
2569 	rx->info[idx].map = rx->extra_map;
2570 	rx->extra_map = old_map;
2571 
2572 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2573 	 * aligned */
2574 	m->m_data += MXGEFW_PAD;
2575 
2576 	m->m_pkthdr.rcvif = ifp;
2577 	m->m_len = m->m_pkthdr.len = len;
2578 	ss->ipackets++;
2579 	eh = mtod(m, struct ether_header *);
2580 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2581 		mxge_vlan_tag_remove(m, &csum);
2582 	}
2583 	/* if the checksum is valid, mark it in the mbuf header */
2584 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2585 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2586 			return;
2587 		/* otherwise, it was a UDP frame, or a TCP frame which
2588 		   we could not do LRO on.  Tell the stack that the
2589 		   checksum is good */
2590 		m->m_pkthdr.csum_data = 0xffff;
2591 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2592 	}
2593 #if 0
2594 	/* flowid only valid if RSS hashing is enabled */
2595 	if (sc->num_slices > 1) {
2596 		m->m_pkthdr.flowid = (ss - sc->ss);
2597 		m->m_flags |= M_FLOWID;
2598 	}
2599 #endif
2600 	ifp->if_input(ifp, m);
2601 }
2602 
2603 static inline void
2604 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2605 {
2606 	mxge_softc_t *sc;
2607 	struct ifnet *ifp;
2608 	struct ether_header *eh;
2609 	struct mbuf *m;
2610 	mxge_rx_ring_t *rx;
2611 	bus_dmamap_t old_map;
2612 	int idx;
2613 	uint16_t tcpudp_csum;
2614 
2615 	sc = ss->sc;
2616 	ifp = sc->ifp;
2617 	rx = &ss->rx_small;
2618 	idx = rx->cnt & rx->mask;
2619 	rx->cnt++;
2620 	/* save a pointer to the received mbuf */
2621 	m = rx->info[idx].m;
2622 	/* try to replace the received mbuf */
2623 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2624 		/* drop the frame -- the old mbuf is re-cycled */
2625 		ifp->if_ierrors++;
2626 		return;
2627 	}
2628 
2629 	/* unmap the received buffer */
2630 	old_map = rx->info[idx].map;
2631 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2632 	bus_dmamap_unload(rx->dmat, old_map);
2633 
2634 	/* swap the bus_dmamap_t's */
2635 	rx->info[idx].map = rx->extra_map;
2636 	rx->extra_map = old_map;
2637 
2638 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2639 	 * aligned */
2640 	m->m_data += MXGEFW_PAD;
2641 
2642 	m->m_pkthdr.rcvif = ifp;
2643 	m->m_len = m->m_pkthdr.len = len;
2644 	ss->ipackets++;
2645 	eh = mtod(m, struct ether_header *);
2646 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2647 		mxge_vlan_tag_remove(m, &csum);
2648 	}
2649 	/* if the checksum is valid, mark it in the mbuf header */
2650 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2651 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2652 			return;
2653 		/* otherwise, it was a UDP frame, or a TCP frame which
2654 		   we could not do LRO on.  Tell the stack that the
2655 		   checksum is good */
2656 		m->m_pkthdr.csum_data = 0xffff;
2657 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2658 	}
2659 #if 0
2660 	/* flowid only valid if RSS hashing is enabled */
2661 	if (sc->num_slices > 1) {
2662 		m->m_pkthdr.flowid = (ss - sc->ss);
2663 		m->m_flags |= M_FLOWID;
2664 	}
2665 #endif
2666 	ifp->if_input(ifp, m);
2667 }
2668 
2669 /*
2670  * XXX
2671  *
2672  * Inlining the call to this function causes mxge_intr() to grow too large
2673  * for GCC's stack size limits (which shouldn't take into account inlining
2674  * of leaf functions at one call site anyway). Inlining is definitely a
2675  * good idea in this case though, so mark the function appropriately.
2676  */
2677 static inline __always_inline void
2678 mxge_clean_rx_done(struct mxge_slice_state *ss)
2679 {
2680 	mxge_rx_done_t *rx_done = &ss->rx_done;
2681 	int limit = 0;
2682 	uint16_t length;
2683 	uint16_t checksum;
2684 
2685 	while (rx_done->entry[rx_done->idx].length != 0) {
2686 		length = ntohs(rx_done->entry[rx_done->idx].length);
2687 		rx_done->entry[rx_done->idx].length = 0;
2688 		checksum = rx_done->entry[rx_done->idx].checksum;
2689 		if (length <= (MHLEN - MXGEFW_PAD))
2690 			mxge_rx_done_small(ss, length, checksum);
2691 		else
2692 			mxge_rx_done_big(ss, length, checksum);
2693 		rx_done->cnt++;
2694 		rx_done->idx = rx_done->cnt & rx_done->mask;
2695 
2696 		/* limit potential for livelock */
2697 		if (__predict_false(++limit > rx_done->mask / 2))
2698 			break;
2699 	}
2700 #ifdef INET
2701 	while (!SLIST_EMPTY(&ss->lro_active)) {
2702 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2703 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2704 		mxge_lro_flush(ss, lro);
2705 	}
2706 #endif
2707 }
2708 
2709 
2710 static inline void
2711 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2712 {
2713 	struct ifnet *ifp;
2714 	mxge_tx_ring_t *tx;
2715 	struct mbuf *m;
2716 	bus_dmamap_t map;
2717 	int idx;
2718 	int *flags;
2719 
2720 	tx = &ss->tx;
2721 	ifp = ss->sc->ifp;
2722 	ASSERT_SERIALIZED(ifp->if_serializer);
2723 	while (tx->pkt_done != mcp_idx) {
2724 		idx = tx->done & tx->mask;
2725 		tx->done++;
2726 		m = tx->info[idx].m;
2727 		/* mbuf and DMA map only attached to the first
2728 		   segment per-mbuf */
2729 		if (m != NULL) {
2730 			ss->obytes += m->m_pkthdr.len;
2731 			if (m->m_flags & M_MCAST)
2732 				ss->omcasts++;
2733 			ss->opackets++;
2734 			tx->info[idx].m = NULL;
2735 			map = tx->info[idx].map;
2736 			bus_dmamap_unload(tx->dmat, map);
2737 			m_freem(m);
2738 		}
2739 		if (tx->info[idx].flag) {
2740 			tx->info[idx].flag = 0;
2741 			tx->pkt_done++;
2742 		}
2743 	}
2744 
2745 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2746 	   it's OK to send packets */
2747 #ifdef IFNET_BUF_RING
2748 	flags = &ss->if_flags;
2749 #else
2750 	flags = &ifp->if_flags;
2751 #endif
2752 	if ((*flags) & IFF_OACTIVE &&
2753 	    tx->req - tx->done < (tx->mask + 1)/4) {
2754 		*(flags) &= ~IFF_OACTIVE;
2755 		ss->tx.wake++;
2756 		mxge_start_locked(ss);
2757 	}
2758 #ifdef IFNET_BUF_RING
2759 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2760 		/* let the NIC stop polling this queue, since there
2761 		 * are no more transmits pending */
2762 		*tx->send_stop = 1;
2763 		tx->queue_active = 0;
2764 		tx->deactivate++;
2765 		wmb();
2766 	}
2769 #endif
2770 
2771 }
2772 
2773 static struct mxge_media_type mxge_xfp_media_types[] =
2774 {
2775 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2776 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2777 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2778 	{0,		(1 << 5),	"10GBASE-ER"},
2779 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2780 	{0,		(1 << 3),	"10GBASE-SW"},
2781 	{0,		(1 << 2),	"10GBASE-LW"},
2782 	{0,		(1 << 1),	"10GBASE-EW"},
2783 	{0,		(1 << 0),	"Reserved"}
2784 };
2785 static struct mxge_media_type mxge_sfp_media_types[] =
2786 {
2787 	{0,		(1 << 7),	"Reserved"},
2788 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2789 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2790 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2791 };
2792 
2793 static void
2794 mxge_set_media(mxge_softc_t *sc, int type)
2795 {
2796 	sc->media_flags |= type;
2797 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2798 	ifmedia_set(&sc->media, sc->media_flags);
2799 }
2800 
2801 
2802 /*
2803  * Determine the media type for a NIC.  Some XFPs will identify
2804  * themselves only when their link is up, so this is initiated via a
2805  * link up interrupt.  However, this can potentially take up to
2806  * several milliseconds, so it is run via the watchdog routine, rather
2807  * than in the interrupt handler itself.   This need only be done
2808  * once, not each time the link is up.
2809  */
2810 static void
2811 mxge_media_probe(mxge_softc_t *sc)
2812 {
2813 	mxge_cmd_t cmd;
2814 	char *cage_type;
2815 	char *ptr;
2816 	struct mxge_media_type *mxge_media_types = NULL;
2817 	int i, err, ms, mxge_media_type_entries;
2818 	uint32_t byte;
2819 
2820 	sc->need_media_probe = 0;
2821 
2822 	/* if we've already set a media type, we're done */
2823 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2824 		return;
2825 
2826 	/*
2827 	 * parse the product code to determine the interface type
2828 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2829 	 * after the 3rd dash in the driver's cached copy of the
2830 	 * EEPROM's product code string.
2831 	 */
2832 	ptr = sc->product_code_string;
2833 	if (ptr == NULL) {
2834 		device_printf(sc->dev, "Missing product code\n");
		return;	/* nothing to parse */
2835 	}
2836 
2837 	for (i = 0; i < 3; i++, ptr++) {
2838 		ptr = index(ptr, '-');
2839 		if (ptr == NULL) {
2840 			device_printf(sc->dev,
2841 				      "only %d dashes in PC?!?\n", i);
2842 			return;
2843 		}
2844 	}
2845 	if (*ptr == 'C') {
2846 		/* -C is CX4 */
2847 		mxge_set_media(sc, IFM_10G_CX4);
2848 		return;
2849 	} else if (*ptr == 'Q') {
2851 		/* -Q is Quad Ribbon Fiber */
2852 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2853 		/* FreeBSD has no media type for Quad ribbon fiber */
2854 		return;
2855 	}
2856 
2857 	if (*ptr == 'R') {
2858 		/* -R is XFP */
2859 		mxge_media_types = mxge_xfp_media_types;
2860 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2861 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2862 		cage_type = "XFP";
2863 	}
2864 
2865 	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2866 		/* -S or -2S is SFP+ */
2867 		mxge_media_types = mxge_sfp_media_types;
2868 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2869 		cage_type = "SFP+";
2870 		byte = 3;
2871 	}
2872 
2873 	if (mxge_media_types == NULL) {
2874 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2875 		return;
2876 	}
2877 
2878 	/*
2879 	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2880 	 * we try to determine what is in the cage by using the
2881 	 * firmware's I2C commands to read the 10GbE compliance
2882 	 * register.  We read just one byte, which may take over
2883 	 * a millisecond.
2884 	 */
2885 
2886 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2887 	cmd.data1 = byte;
2888 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2889 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2890 		device_printf(sc->dev, "failed to read XFP\n");
2891 	}
2892 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2893 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2894 	}
2895 	if (err != MXGEFW_CMD_OK) {
2896 		return;
2897 	}
2898 
2899 	/* now we wait for the data to be cached */
2900 	cmd.data0 = byte;
2901 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2902 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2903 		DELAY(1000);
2904 		cmd.data0 = byte;
2905 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2906 	}
2907 	if (err != MXGEFW_CMD_OK) {
2908 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2909 			      cage_type, err, ms);
2910 		return;
2911 	}
2912 
2913 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2914 		if (mxge_verbose)
2915 			device_printf(sc->dev, "%s:%s\n", cage_type,
2916 				      mxge_media_types[0].name);
2917 		mxge_set_media(sc, IFM_10G_CX4);
2918 		return;
2919 	}
2920 	for (i = 1; i < mxge_media_type_entries; i++) {
2921 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2922 			if (mxge_verbose)
2923 				device_printf(sc->dev, "%s:%s\n",
2924 					      cage_type,
2925 					      mxge_media_types[i].name);
2926 
2927 			mxge_set_media(sc, mxge_media_types[i].flag);
2928 			return;
2929 		}
2930 	}
2931 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2932 		      cmd.data0);
2933 
2934 	return;
2935 }
2936 
2937 static void
2938 mxge_intr(void *arg)
2939 {
2940 	struct mxge_slice_state *ss = arg;
2941 	mxge_softc_t *sc = ss->sc;
2942 	mcp_irq_data_t *stats = ss->fw_stats;
2943 	mxge_tx_ring_t *tx = &ss->tx;
2944 	mxge_rx_done_t *rx_done = &ss->rx_done;
2945 	uint32_t send_done_count;
2946 	uint8_t valid;
2947 
2948 
2949 #ifndef IFNET_BUF_RING
2950 	/* an interrupt on a non-zero slice is implicitly valid
2951 	   since MSI-X irqs are not shared */
2952 	if (ss != sc->ss) {
2953 		mxge_clean_rx_done(ss);
2954 		*ss->irq_claim = be32toh(3);
2955 		return;
2956 	}
2957 #endif
2958 
2959 	/* make sure the DMA has finished */
2960 	if (!stats->valid) {
2961 		return;
2962 	}
2963 	valid = stats->valid;
2964 
2965 	if (sc->legacy_irq) {
2966 		/* lower legacy IRQ  */
2967 		*sc->irq_deassert = 0;
2968 		if (!mxge_deassert_wait)
2969 			/* don't wait for confirmation that the irq line is low */
2970 			stats->valid = 0;
2971 	} else {
2972 		stats->valid = 0;
2973 	}
2974 
2975 	/* loop while waiting for legacy irq deassertion */
2976 	do {
2977 		/* check for transmit completes and receives */
2978 		send_done_count = be32toh(stats->send_done_count);
2979 		while ((send_done_count != tx->pkt_done) ||
2980 		       (rx_done->entry[rx_done->idx].length != 0)) {
2981 			if (send_done_count != tx->pkt_done)
2982 				mxge_tx_done(ss, (int)send_done_count);
2983 			mxge_clean_rx_done(ss);
2984 			send_done_count = be32toh(stats->send_done_count);
2985 		}
2986 		if (sc->legacy_irq && mxge_deassert_wait)
2987 			wmb();
2988 	} while (*((volatile uint8_t *) &stats->valid));
2989 
2990 	/* fw link & error stats meaningful only on the first slice */
2991 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2992 		if (sc->link_state != stats->link_up) {
2993 			sc->link_state = stats->link_up;
2994 			if (sc->link_state) {
2995 				sc->ifp->if_link_state = LINK_STATE_UP;
2996 				if_link_state_change(sc->ifp);
2997 				if (mxge_verbose)
2998 					device_printf(sc->dev, "link up\n");
2999 			} else {
3000 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3001 				if_link_state_change(sc->ifp);
3002 				if (mxge_verbose)
3003 					device_printf(sc->dev, "link down\n");
3004 			}
3005 			sc->need_media_probe = 1;
3006 		}
3007 		if (sc->rdma_tags_available !=
3008 		    be32toh(stats->rdma_tags_available)) {
3009 			sc->rdma_tags_available =
3010 				be32toh(stats->rdma_tags_available);
3011 			device_printf(sc->dev, "RDMA timed out! %d tags "
3012 				      "left\n", sc->rdma_tags_available);
3013 		}
3014 
3015 		if (stats->link_down) {
3016 			sc->down_cnt += stats->link_down;
3017 			sc->link_state = 0;
3018 			sc->ifp->if_link_state = LINK_STATE_DOWN;
3019 			if_link_state_change(sc->ifp);
3020 		}
3021 	}
3022 
3023 	/* check to see if we have rx token to pass back */
3024 	if (valid & 0x1)
3025 	    *ss->irq_claim = be32toh(3);
3026 	*(ss->irq_claim + 1) = be32toh(3);
3027 }
3028 
3029 static void
3030 mxge_init(void *arg)
3031 {
3032 }
3033 
3034 
3035 
3036 static void
3037 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3038 {
3039 	struct lro_entry *lro_entry;
3040 	int i;
3041 
3042 	while (!SLIST_EMPTY(&ss->lro_free)) {
3043 		lro_entry = SLIST_FIRST(&ss->lro_free);
3044 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3045 		kfree(lro_entry, M_DEVBUF);
3046 	}
3047 
3048 	for (i = 0; i <= ss->rx_big.mask; i++) {
3049 		if (ss->rx_big.info[i].m == NULL)
3050 			continue;
3051 		bus_dmamap_unload(ss->rx_big.dmat,
3052 				  ss->rx_big.info[i].map);
3053 		m_freem(ss->rx_big.info[i].m);
3054 		ss->rx_big.info[i].m = NULL;
3055 	}
3056 
3057 	for (i = 0; i <= ss->rx_small.mask; i++) {
3058 		if (ss->rx_small.info[i].m == NULL)
3059 			continue;
3060 		bus_dmamap_unload(ss->rx_small.dmat,
3061 				  ss->rx_small.info[i].map);
3062 		m_freem(ss->rx_small.info[i].m);
3063 		ss->rx_small.info[i].m = NULL;
3064 	}
3065 
3066 	/* transmit ring used only on the first slice */
3067 	if (ss->tx.info == NULL)
3068 		return;
3069 
3070 	for (i = 0; i <= ss->tx.mask; i++) {
3071 		ss->tx.info[i].flag = 0;
3072 		if (ss->tx.info[i].m == NULL)
3073 			continue;
3074 		bus_dmamap_unload(ss->tx.dmat,
3075 				  ss->tx.info[i].map);
3076 		m_freem(ss->tx.info[i].m);
3077 		ss->tx.info[i].m = NULL;
3078 	}
3079 }
3080 
3081 static void
3082 mxge_free_mbufs(mxge_softc_t *sc)
3083 {
3084 	int slice;
3085 
3086 	for (slice = 0; slice < sc->num_slices; slice++)
3087 		mxge_free_slice_mbufs(&sc->ss[slice]);
3088 }
3089 
3090 static void
3091 mxge_free_slice_rings(struct mxge_slice_state *ss)
3092 {
3093 	int i;
3094 
3095 
3096 	if (ss->rx_done.entry != NULL)
3097 		mxge_dma_free(&ss->rx_done.dma);
3098 	ss->rx_done.entry = NULL;
3099 
3100 	if (ss->tx.req_bytes != NULL)
3101 		kfree(ss->tx.req_bytes, M_DEVBUF);
3102 	ss->tx.req_bytes = NULL;
3103 
3104 	if (ss->tx.seg_list != NULL)
3105 		kfree(ss->tx.seg_list, M_DEVBUF);
3106 	ss->tx.seg_list = NULL;
3107 
3108 	if (ss->rx_small.shadow != NULL)
3109 		kfree(ss->rx_small.shadow, M_DEVBUF);
3110 	ss->rx_small.shadow = NULL;
3111 
3112 	if (ss->rx_big.shadow != NULL)
3113 		kfree(ss->rx_big.shadow, M_DEVBUF);
3114 	ss->rx_big.shadow = NULL;
3115 
3116 	if (ss->tx.info != NULL) {
3117 		if (ss->tx.dmat != NULL) {
3118 			for (i = 0; i <= ss->tx.mask; i++) {
3119 				bus_dmamap_destroy(ss->tx.dmat,
3120 						   ss->tx.info[i].map);
3121 			}
3122 			bus_dma_tag_destroy(ss->tx.dmat);
3123 		}
3124 		kfree(ss->tx.info, M_DEVBUF);
3125 	}
3126 	ss->tx.info = NULL;
3127 
3128 	if (ss->rx_small.info != NULL) {
3129 		if (ss->rx_small.dmat != NULL) {
3130 			for (i = 0; i <= ss->rx_small.mask; i++) {
3131 				bus_dmamap_destroy(ss->rx_small.dmat,
3132 						   ss->rx_small.info[i].map);
3133 			}
3134 			bus_dmamap_destroy(ss->rx_small.dmat,
3135 					   ss->rx_small.extra_map);
3136 			bus_dma_tag_destroy(ss->rx_small.dmat);
3137 		}
3138 		kfree(ss->rx_small.info, M_DEVBUF);
3139 	}
3140 	ss->rx_small.info = NULL;
3141 
3142 	if (ss->rx_big.info != NULL) {
3143 		if (ss->rx_big.dmat != NULL) {
3144 			for (i = 0; i <= ss->rx_big.mask; i++) {
3145 				bus_dmamap_destroy(ss->rx_big.dmat,
3146 						   ss->rx_big.info[i].map);
3147 			}
3148 			bus_dmamap_destroy(ss->rx_big.dmat,
3149 					   ss->rx_big.extra_map);
3150 			bus_dma_tag_destroy(ss->rx_big.dmat);
3151 		}
3152 		kfree(ss->rx_big.info, M_DEVBUF);
3153 	}
3154 	ss->rx_big.info = NULL;
3155 }
3156 
3157 static void
3158 mxge_free_rings(mxge_softc_t *sc)
3159 {
3160 	int slice;
3161 
3162 	for (slice = 0; slice < sc->num_slices; slice++)
3163 		mxge_free_slice_rings(&sc->ss[slice]);
3164 }
3165 
3166 static int
3167 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3168 		       int tx_ring_entries)
3169 {
3170 	mxge_softc_t *sc = ss->sc;
3171 	size_t bytes;
3172 	int err, i;
3173 
3174 	err = ENOMEM;
3175 
3176 	/* allocate per-slice receive resources */
3177 
3178 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3179 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
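	/*
	 * The completion ring is twice the receive ring size,
	 * presumably because it carries completions for both the small
	 * and the big receive rings, which share it per slice.
	 */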
3180 
3181 	/* allocate the rx shadow rings */
3182 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3183 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3184 
3185 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3186 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3187 
3188 	/* allocate the rx host info rings */
3189 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3190 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191 
3192 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3193 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3194 
3195 	/* allocate the rx busdma resources */
3196 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3197 				 1,			/* alignment */
3198 				 4096,			/* boundary */
3199 				 BUS_SPACE_MAXADDR,	/* low */
3200 				 BUS_SPACE_MAXADDR,	/* high */
3201 				 NULL, NULL,		/* filter */
3202 				 MHLEN,			/* maxsize */
3203 				 1,			/* num segs */
3204 				 MHLEN,			/* maxsegsize */
3205 				 BUS_DMA_ALLOCNOW,	/* flags */
3206 				 &ss->rx_small.dmat);	/* tag */
3207 	if (err != 0) {
3208 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3209 			      err);
3210 		return err;
3211 	}
3212 
3213 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3214 				 1,			/* alignment */
3215 #if MXGE_VIRT_JUMBOS
3216 				 4096,			/* boundary */
3217 #else
3218 				 0,			/* boundary */
3219 #endif
3220 				 BUS_SPACE_MAXADDR,	/* low */
3221 				 BUS_SPACE_MAXADDR,	/* high */
3222 				 NULL, NULL,		/* filter */
3223 				 3*4096,		/* maxsize */
3224 #if MXGE_VIRT_JUMBOS
3225 				 3,			/* num segs */
3226 				 4096,			/* maxsegsize*/
3227 #else
3228 				 1,			/* num segs */
3229 				 MJUM9BYTES,		/* maxsegsize*/
3230 #endif
3231 				 BUS_DMA_ALLOCNOW,	/* flags */
3232 				 &ss->rx_big.dmat);	/* tag */
3233 	if (err != 0) {
3234 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3235 			      err);
3236 		return err;
3237 	}
3238 	for (i = 0; i <= ss->rx_small.mask; i++) {
3239 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3240 					&ss->rx_small.info[i].map);
3241 		if (err != 0) {
3242 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3243 				      err);
3244 			return err;
3245 		}
3246 	}
3247 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3248 				&ss->rx_small.extra_map);
3249 	if (err != 0) {
3250 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3251 			      err);
3252 		return err;
3253 	}
3254 
3255 	for (i = 0; i <= ss->rx_big.mask; i++) {
3256 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3257 					&ss->rx_big.info[i].map);
3258 		if (err != 0) {
3259 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3260 				      err);
3261 			return err;
3262 		}
3263 	}
3264 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3265 				&ss->rx_big.extra_map);
3266 	if (err != 0) {
3267 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3268 			      err);
3269 		return err;
3270 	}
3271 
3272 	/* now allocate TX resources */
3273 
3274 #ifndef IFNET_BUF_RING
3275 	/* only use a single TX ring for now */
3276 	if (ss != ss->sc->ss)
3277 		return 0;
3278 #endif
3279 
3280 	ss->tx.mask = tx_ring_entries - 1;
3281 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3282 
3283 
3284 	/* allocate the tx request copy block */
3285 	bytes = 8 +
3286 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3287 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3288 	/* ensure req_list entries are aligned to 8 bytes */
3289 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3290 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
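	/*
	 * (p + 7) & ~7 rounds the pointer up to the next 8-byte
	 * boundary; the 8 slack bytes in the allocation above guarantee
	 * the rounded pointer still lies inside the buffer.
	 */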
3291 
3292 	/* allocate the tx busdma segment list */
3293 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3294 	ss->tx.seg_list = (bus_dma_segment_t *)
3295 		kmalloc(bytes, M_DEVBUF, M_WAITOK); /* M_WAITOK cannot fail */
3298 
3299 	/* allocate the tx host info ring */
3300 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3301 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3302 
3303 	/* allocate the tx busdma resources */
3304 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3305 				 1,			/* alignment */
3306 				 sc->tx_boundary,	/* boundary */
3307 				 BUS_SPACE_MAXADDR,	/* low */
3308 				 BUS_SPACE_MAXADDR,	/* high */
3309 				 NULL, NULL,		/* filter */
3310 				 65536 + 256,		/* maxsize */
3311 				 ss->tx.max_desc - 2,	/* num segs */
3312 				 sc->tx_boundary,	/* maxsegsz */
3313 				 BUS_DMA_ALLOCNOW,	/* flags */
3314 				 &ss->tx.dmat);		/* tag */
3315 
3316 	if (err != 0) {
3317 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3318 			      err);
3319 		return err;
3320 	}
3321 
3322 	/* now use these tags to setup dmamaps for each slot
3323 	   in the ring */
3324 	for (i = 0; i <= ss->tx.mask; i++) {
3325 		err = bus_dmamap_create(ss->tx.dmat, 0,
3326 					&ss->tx.info[i].map);
3327 		if (err != 0) {
3328 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3329 				      err);
3330 			return err;
3331 		}
3332 	}
3333 	return 0;
3335 }
3336 
3337 static int
3338 mxge_alloc_rings(mxge_softc_t *sc)
3339 {
3340 	mxge_cmd_t cmd;
3341 	int tx_ring_size;
3342 	int tx_ring_entries, rx_ring_entries;
3343 	int err, slice;
3344 
3345 	/* get ring sizes */
3346 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3347 	tx_ring_size = cmd.data0;
3348 	if (err != 0) {
3349 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3350 		goto abort;
3351 	}
3352 
3353 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3354 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3355 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3356 	ifq_set_ready(&sc->ifp->if_snd);
3357 
3358 	for (slice = 0; slice < sc->num_slices; slice++) {
3359 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3360 					     rx_ring_entries,
3361 					     tx_ring_entries);
3362 		if (err != 0)
3363 			goto abort;
3364 	}
3365 	return 0;
3366 
3367 abort:
3368 	mxge_free_rings(sc);
3369 	return err;
3371 }
3372 
3373 
3374 static void
3375 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3376 {
3377 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3378 
3379 	if (bufsize < MCLBYTES) {
3380 		/* easy, everything fits in a single buffer */
3381 		*big_buf_size = MCLBYTES;
3382 		*cl_size = MCLBYTES;
3383 		*nbufs = 1;
3384 		return;
3385 	}
3386 
3387 	if (bufsize < MJUMPAGESIZE) {
3388 		/* still easy, everything still fits in a single buffer */
3389 		*big_buf_size = MJUMPAGESIZE;
3390 		*cl_size = MJUMPAGESIZE;
3391 		*nbufs = 1;
3392 		return;
3393 	}
3394 #if MXGE_VIRT_JUMBOS
3395 	/* now we need to use virtually contiguous buffers */
3396 	*cl_size = MJUM9BYTES;
3397 	*big_buf_size = 4096;
3398 	*nbufs = mtu / 4096 + 1;
3399 	/* needs to be a power of two, so round up */
3400 	if (*nbufs == 3)
3401 		*nbufs = 4;
3402 #else
3403 	*cl_size = MJUM9BYTES;
3404 	*big_buf_size = MJUM9BYTES;
3405 	*nbufs = 1;
3406 #endif
3407 }
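/*
 * Worked example (illustrative, assuming 4 KB pages): for a 9000-byte
 * MTU, bufsize = 9000 + 14 + 4 + 2 = 9020, which exceeds both MCLBYTES
 * and MJUMPAGESIZE.  Without MXGE_VIRT_JUMBOS this selects a single
 * MJUM9BYTES cluster per frame; with it, cl_size stays MJUM9BYTES but
 * each frame is offered to the NIC as 9000 / 4096 + 1 = 3 buffers,
 * rounded up to 4 so nbufs stays a power of two.
 */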
3408 
3409 static int
3410 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3411 {
3412 	mxge_softc_t *sc;
3413 	mxge_cmd_t cmd;
3414 	bus_dmamap_t map;
3415 	struct lro_entry *lro_entry;
3416 	int err, i, slice;
3417 
3418 
3419 	sc = ss->sc;
3420 	slice = ss - sc->ss;
3421 
3422 	SLIST_INIT(&ss->lro_free);
3423 	SLIST_INIT(&ss->lro_active);
3424 
3425 	for (i = 0; i < sc->lro_cnt; i++) {
3426 		lro_entry = (struct lro_entry *)
3427 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3428 			       M_NOWAIT | M_ZERO);
3429 		if (lro_entry == NULL) {
3430 			sc->lro_cnt = i;
3431 			break;
3432 		}
3433 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3434 	}
3435 	/* get the lanai pointers to the send and receive rings */
3436 
3437 	err = 0;
3438 #ifndef IFNET_BUF_RING
3439 	/* We currently only send from the first slice */
3440 	if (slice == 0) {
3441 #endif
3442 		cmd.data0 = slice;
3443 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3444 		ss->tx.lanai =
3445 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3446 		ss->tx.send_go = (volatile uint32_t *)
3447 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3448 		ss->tx.send_stop = (volatile uint32_t *)
3449 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3450 #ifndef IFNET_BUF_RING
3451 	}
3452 #endif
3453 	cmd.data0 = slice;
3454 	err |= mxge_send_cmd(sc,
3455 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3456 	ss->rx_small.lanai =
3457 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3458 	cmd.data0 = slice;
3459 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3460 	ss->rx_big.lanai =
3461 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3462 
3463 	if (err != 0) {
3464 		device_printf(sc->dev,
3465 			      "failed to get ring sizes or locations\n");
3466 		return EIO;
3467 	}
3468 
3469 	/* stock receive rings */
3470 	for (i = 0; i <= ss->rx_small.mask; i++) {
3471 		map = ss->rx_small.info[i].map;
3472 		err = mxge_get_buf_small(ss, map, i);
3473 		if (err) {
3474 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3475 				      i, ss->rx_small.mask + 1);
3476 			return ENOMEM;
3477 		}
3478 	}
3479 	for (i = 0; i <= ss->rx_big.mask; i++) {
3480 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3481 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3482 	}
3483 	ss->rx_big.nbufs = nbufs;
3484 	ss->rx_big.cl_size = cl_size;
3485 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3486 		EVL_ENCAPLEN + MXGEFW_PAD;
3487 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3488 		map = ss->rx_big.info[i].map;
3489 		err = mxge_get_buf_big(ss, map, i);
3490 		if (err) {
3491 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3492 				      i, ss->rx_big.mask + 1);
3493 			return ENOMEM;
3494 		}
3495 	}
3496 	return 0;
3497 }
3498 
3499 static int
3500 mxge_open(mxge_softc_t *sc)
3501 {
3502 	mxge_cmd_t cmd;
3503 	int err, big_bytes, nbufs, slice, cl_size, i;
3504 	bus_addr_t bus;
3505 	volatile uint8_t *itable;
3506 	struct mxge_slice_state *ss;
3507 
3508 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3509 	/* Copy the MAC address in case it was overridden */
3510 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3511 
3512 	err = mxge_reset(sc, 1);
3513 	if (err != 0) {
3514 		device_printf(sc->dev, "failed to reset\n");
3515 		return EIO;
3516 	}
3517 
3518 	if (sc->num_slices > 1) {
3519 		/* setup the indirection table */
3520 		cmd.data0 = sc->num_slices;
3521 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3522 				    &cmd);
3523 
3524 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3525 				     &cmd);
3526 		if (err != 0) {
3527 			device_printf(sc->dev,
3528 				      "failed to setup rss tables\n");
3529 			return err;
3530 		}
3531 
3532 		/* just enable an identity mapping */
3533 		itable = sc->sram + cmd.data0;
3534 		for (i = 0; i < sc->num_slices; i++)
3535 			itable[i] = (uint8_t)i;
3536 
3537 		cmd.data0 = 1;
3538 		cmd.data1 = mxge_rss_hash_type;
3539 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3540 		if (err != 0) {
3541 			device_printf(sc->dev, "failed to enable slices\n");
3542 			return err;
3543 		}
3544 	}
3545 
3546 
3547 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3548 
3549 	cmd.data0 = nbufs;
3550 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3551 			    &cmd);
3552 	/* error is only meaningful if we're trying to set
3553 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3554 	if (err && nbufs > 1) {
3555 		device_printf(sc->dev,
3556 			      "Failed to set always-use-n to %d\n",
3557 			      nbufs);
3558 		return EIO;
3559 	}
3560 	/* Give the firmware the mtu and the big and small buffer
3561 	   sizes.  The firmware wants the big buf size to be a power
3562 	   of two. Luckily, FreeBSD's clusters are powers of two */
3563 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3564 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3565 	cmd.data0 = MHLEN - MXGEFW_PAD;
3566 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3567 			     &cmd);
3568 	cmd.data0 = big_bytes;
3569 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3570 
3571 	if (err != 0) {
3572 		device_printf(sc->dev, "failed to setup params\n");
3573 		goto abort;
3574 	}
3575 
3576 	/* Now give the firmware the pointer to the stats block */
3577 	for (slice = 0;
3578 #ifdef IFNET_BUF_RING
3579 	     slice < sc->num_slices;
3580 #else
3581 	     slice < 1;
3582 #endif
3583 	     slice++) {
3584 		ss = &sc->ss[slice];
3585 		cmd.data0 =
3586 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3587 		cmd.data1 =
3588 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3589 		cmd.data2 = sizeof(struct mcp_irq_data);
3590 		cmd.data2 |= (slice << 16);
3591 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3592 	}
3593 
3594 	if (err != 0) {
3595 		bus = sc->ss->fw_stats_dma.bus_addr;
3596 		bus += offsetof(struct mcp_irq_data, send_done_count);
3597 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3598 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3599 		err = mxge_send_cmd(sc,
3600 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3601 				    &cmd);
3602 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3603 		sc->fw_multicast_support = 0;
3604 	} else {
3605 		sc->fw_multicast_support = 1;
3606 	}
3607 
3608 	if (err != 0) {
3609 		device_printf(sc->dev, "failed to setup params\n");
3610 		goto abort;
3611 	}
3612 
3613 	for (slice = 0; slice < sc->num_slices; slice++) {
3614 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3615 		if (err != 0) {
3616 			device_printf(sc->dev, "couldn't open slice %d\n",
3617 				      slice);
3618 			goto abort;
3619 		}
3620 	}
3621 
3622 	/* Finally, start the firmware running */
3623 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3624 	if (err) {
3625 		device_printf(sc->dev, "Couldn't bring up link\n");
3626 		goto abort;
3627 	}
3628 #ifdef IFNET_BUF_RING
3629 	for (slice = 0; slice < sc->num_slices; slice++) {
3630 		ss = &sc->ss[slice];
3631 		ss->if_flags |= IFF_RUNNING;
3632 		ss->if_flags &= ~IFF_OACTIVE;
3633 	}
3634 #endif
3635 	sc->ifp->if_flags |= IFF_RUNNING;
3636 	sc->ifp->if_flags &= ~IFF_OACTIVE;
3637 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3638 
3639 	return 0;
3640 
3641 
3642 abort:
3643 	mxge_free_mbufs(sc);
3644 
3645 	return err;
3646 }
3647 
3648 static int
3649 mxge_close(mxge_softc_t *sc)
3650 {
3651 	mxge_cmd_t cmd;
3652 	int err, old_down_cnt;
3653 #ifdef IFNET_BUF_RING
3654 	struct mxge_slice_state *ss;
3655 	int slice;
3656 #endif
3657 
3658 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3659 	callout_stop(&sc->co_hdl);
3660 #ifdef IFNET_BUF_RING
3661 	for (slice = 0; slice < sc->num_slices; slice++) {
3662 		ss = &sc->ss[slice];
3663 		ss->if_flags &= ~IFF_RUNNING;
3664 	}
3665 #endif
3666 	sc->ifp->if_flags &= ~IFF_RUNNING;
3667 	old_down_cnt = sc->down_cnt;
3668 	wmb();
3669 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3670 	if (err) {
3671 		device_printf(sc->dev, "Couldn't bring down link\n");
3672 	}
3673 	if (old_down_cnt == sc->down_cnt) {
3674 		/* wait for down irq */
3675 		DELAY(10 * sc->intr_coal_delay);
3676 	}
3677 	wmb();
3678 	if (old_down_cnt == sc->down_cnt) {
3679 		device_printf(sc->dev, "never got down irq\n");
3680 	}
3681 
3682 	mxge_free_mbufs(sc);
3683 
3684 	return 0;
3685 }
3686 
3687 static void
3688 mxge_setup_cfg_space(mxge_softc_t *sc)
3689 {
3690 	device_t dev = sc->dev;
3691 	int reg;
3692 	uint16_t cmd, lnk, pectl;
3693 
3694 	/* find the PCIe link width and set max read request to 4KB */
3695 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3696 		lnk = pci_read_config(dev, reg + 0x12, 2);
3697 		sc->link_width = (lnk >> 4) & 0x3f;
3698 
3699 		pectl = pci_read_config(dev, reg + 0x8, 2);
3700 		pectl = (pectl & ~0x7000) | (5 << 12);
3701 		pci_write_config(dev, reg + 0x8, pectl, 2);
3702 	}
3703 
3704 	/* Enable DMA and Memory space access */
3705 	pci_enable_busmaster(dev);
3706 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3707 	cmd |= PCIM_CMD_MEMEN;
3708 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3709 }
3710 
3711 static uint32_t
3712 mxge_read_reboot(mxge_softc_t *sc)
3713 {
3714 	device_t dev = sc->dev;
3715 	uint32_t vs;
3716 
3717 	/* find the vendor specific offset */
3718 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3719 		device_printf(sc->dev,
3720 			      "could not find vendor specific offset\n");
3721 		return (uint32_t)-1;
3722 	}
3723 	/* enable read32 mode */
3724 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3725 	/* tell NIC which register to read */
3726 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3727 	return (pci_read_config(dev, vs + 0x14, 4));
3728 }
3729 
3730 static int
3731 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3732 {
3733 	struct pci_devinfo *dinfo;
3734 	mxge_tx_ring_t *tx;
3735 	int err;
3736 	uint32_t reboot;
3737 	uint16_t cmd;
3738 
3739 	err = ENXIO;
3740 
3741 	device_printf(sc->dev, "Watchdog reset!\n");
3742 
3743 	/*
3744 	 * check to see if the NIC rebooted.  If it did, then all of
3745 	 * PCI config space has been reset, and things like the
3746 	 * busmaster bit will be zero.  If this is the case, then we
3747 	 * must restore PCI config space before the NIC can be used
3748 	 * again
3749 	 */
3750 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
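	/*
	 * A config space read of all ones means the device did not
	 * respond on the bus (the read was master-aborted), which is
	 * what we see while the NIC is still rebooting.
	 */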
3751 	if (cmd == 0xffff) {
3752 		/*
3753 		 * maybe the watchdog caught the NIC rebooting; wait
3754 		 * up to 100ms for it to finish.  If it does not come
3755 		 * back, then give up
3756 		 */
3757 		DELAY(1000*100);
3758 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3759 		if (cmd == 0xffff) {
3760 			device_printf(sc->dev, "NIC disappeared!\n");
3761 			return (err);
3762 		}
3763 	}
3764 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3765 		/* print the reboot status */
3766 		reboot = mxge_read_reboot(sc);
3767 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3768 			      reboot);
3769 		/* restore PCI configuration space */
3770 		dinfo = device_get_ivars(sc->dev);
3771 		pci_cfg_restore(sc->dev, dinfo);
3772 
3773 		/* and redo any changes we made to our config space */
3774 		mxge_setup_cfg_space(sc);
3775 
3776 		if (sc->ifp->if_flags & IFF_RUNNING) {
3777 			mxge_close(sc);
3778 			err = mxge_open(sc);
3779 		}
3780 	} else {
3781 		tx = &sc->ss[slice].tx;
3782 		device_printf(sc->dev,
3783 			      "NIC did not reboot, slice %d ring state:\n",
3784 			      slice);
3785 		device_printf(sc->dev,
3786 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3787 			      tx->req, tx->done, tx->queue_active);
3788 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3789 			      tx->activate, tx->deactivate);
3790 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3791 			      tx->pkt_done,
3792 			      be32toh(sc->ss->fw_stats->send_done_count));
3793 		device_printf(sc->dev, "not resetting\n");
3794 	}
3795 	return (err);
3796 }
3797 
3798 static int
3799 mxge_watchdog(mxge_softc_t *sc)
3800 {
3801 	mxge_tx_ring_t *tx;
3802 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3803 	int i, err = 0;
3804 
3805 	/* see if we have outstanding transmits that have been
3806 	   pending for more than mxge_ticks */
3807 	for (i = 0;
3808 #ifdef IFNET_BUF_RING
3809 	     (i < sc->num_slices) && (err == 0);
3810 #else
3811 	     (i < 1) && (err == 0);
3812 #endif
3813 	     i++) {
3814 		tx = &sc->ss[i].tx;
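		/*
		 * A slice is considered wedged if descriptors are
		 * outstanding now (req != done), work was already
		 * pending at the previous tick (watchdog_req !=
		 * watchdog_done), and nothing has completed since
		 * then (done == watchdog_done).
		 */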
3815 		if (tx->req != tx->done &&
3816 		    tx->watchdog_req != tx->watchdog_done &&
3817 		    tx->done == tx->watchdog_done) {
3818 			/* check for pause blocking before resetting */
3819 			if (tx->watchdog_rx_pause == rx_pause)
3820 				err = mxge_watchdog_reset(sc, i);
3821 			else
3822 				device_printf(sc->dev, "Flow control blocking "
3823 					      "xmits, check link partner\n");
3824 		}
3825 
3826 		tx->watchdog_req = tx->req;
3827 		tx->watchdog_done = tx->done;
3828 		tx->watchdog_rx_pause = rx_pause;
3829 	}
3830 
3831 	if (sc->need_media_probe)
3832 		mxge_media_probe(sc);
3833 	return (err);
3834 }
3835 
3836 static void
3837 mxge_update_stats(mxge_softc_t *sc)
3838 {
3839 	struct mxge_slice_state *ss;
3840 	u_long ipackets = 0;
3841 	u_long opackets = 0;
3842 #ifdef IFNET_BUF_RING
3843 	u_long obytes = 0;
3844 	u_long omcasts = 0;
3845 	u_long odrops = 0;
3846 #endif
3847 	u_long oerrors = 0;
3848 	int slice;
3849 
3850 	for (slice = 0; slice < sc->num_slices; slice++) {
3851 		ss = &sc->ss[slice];
3852 		ipackets += ss->ipackets;
3853 		opackets += ss->opackets;
3854 #ifdef IFNET_BUF_RING
3855 		obytes += ss->obytes;
3856 		omcasts += ss->omcasts;
3857 		odrops += ss->tx.br->br_drops;
3858 #endif
3859 		oerrors += ss->oerrors;
3860 	}
3861 	sc->ifp->if_ipackets = ipackets;
3862 	sc->ifp->if_opackets = opackets;
3863 #ifdef IFNET_BUF_RING
3864 	sc->ifp->if_obytes = obytes;
3865 	sc->ifp->if_omcasts = omcasts;
3866 	sc->ifp->if_snd.ifq_drops = odrops;
3867 #endif
3868 	sc->ifp->if_oerrors = oerrors;
3869 }
3870 
3871 static void
3872 mxge_tick(void *arg)
3873 {
3874 	mxge_softc_t *sc = arg;
3875 	int err = 0;
3876 
3877 	lwkt_serialize_enter(sc->ifp->if_serializer);
3878 	/* aggregate stats from different slices */
3879 	mxge_update_stats(sc);
3880 	if (!sc->watchdog_countdown) {
3881 		err = mxge_watchdog(sc);
3882 		sc->watchdog_countdown = 4;
3883 	}
3884 	sc->watchdog_countdown--;
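	/*
	 * Net effect: stats are refreshed on every tick, but the
	 * watchdog check itself only runs every fourth tick.
	 */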
3885 	if (err == 0)
3886 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3887 	lwkt_serialize_exit(sc->ifp->if_serializer);
3888 }
3889 
3890 static int
3891 mxge_media_change(struct ifnet *ifp)
3892 {
3893 	return EINVAL;
3894 }
3895 
3896 static int
3897 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3898 {
3899 	struct ifnet *ifp = sc->ifp;
3900 	int real_mtu, old_mtu;
3901 	int err = 0;
3902 
3903 	if (ifp->if_serializer)
3904 		ASSERT_SERIALIZED(ifp->if_serializer);
3905 
3906 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
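	/*
	 * real_mtu is the on-wire frame size: payload plus Ethernet
	 * header plus an 802.1Q tag; 60 is the minimum Ethernet frame
	 * length excluding the 4-byte FCS.
	 */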
3907 	if (real_mtu > sc->max_mtu || real_mtu < 60)
3908 		return EINVAL;
3909 	old_mtu = ifp->if_mtu;
3910 	ifp->if_mtu = mtu;
3911 	if (ifp->if_flags & IFF_RUNNING) {
3912 		mxge_close(sc);
3913 		err = mxge_open(sc);
3914 		if (err != 0) {
3915 			ifp->if_mtu = old_mtu;
3916 			mxge_close(sc);
3917 			(void) mxge_open(sc);
3918 		}
3919 	}
3920 	return err;
3921 }
3922 
3923 static void
3924 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3925 {
3926 	mxge_softc_t *sc = ifp->if_softc;
3927 
3928 
3930 		return;
3931 	ifmr->ifm_status = IFM_AVALID;
3932 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3933 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3934 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3935 }
3936 
3937 static int
3938 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3939 {
3940 	mxge_softc_t *sc = ifp->if_softc;
3941 	struct ifreq *ifr = (struct ifreq *)data;
3942 	int err, mask;
3943 
3944 	(void)cr;
3945 	err = 0;
3946 	ASSERT_SERIALIZED(ifp->if_serializer);
3947 	switch (command) {
3948 	case SIOCSIFADDR:
3949 	case SIOCGIFADDR:
3950 		err = ether_ioctl(ifp, command, data);
3951 		break;
3952 
3953 	case SIOCSIFMTU:
3954 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3955 		break;
3956 
3957 	case SIOCSIFFLAGS:
3958 		if (sc->dying) {
3959 			return EINVAL;
3960 		}
3961 		if (ifp->if_flags & IFF_UP) {
3962 			if (!(ifp->if_flags & IFF_RUNNING)) {
3963 				err = mxge_open(sc);
3964 			} else {
3965 				/* take care of promisc and allmulti
3966 				   flag changes */
3967 				mxge_change_promisc(sc,
3968 						    ifp->if_flags & IFF_PROMISC);
3969 				mxge_set_multicast_list(sc);
3970 			}
3971 		} else {
3972 			if (ifp->if_flags & IFF_RUNNING) {
3973 				mxge_close(sc);
3974 			}
3975 		}
3976 		break;
3977 
3978 	case SIOCADDMULTI:
3979 	case SIOCDELMULTI:
3980 		mxge_set_multicast_list(sc);
3981 		break;
3982 
3983 	case SIOCSIFCAP:
3984 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3985 		if (mask & IFCAP_TXCSUM) {
3986 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3987 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3988 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3989 						      | CSUM_TSO);
3990 			} else {
3991 				ifp->if_capenable |= IFCAP_TXCSUM;
3992 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3993 			}
3994 		} else if (mask & IFCAP_RXCSUM) {
3995 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3996 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3997 				sc->csum_flag = 0;
3998 			} else {
3999 				ifp->if_capenable |= IFCAP_RXCSUM;
4000 				sc->csum_flag = 1;
4001 			}
4002 		}
4003 		if (mask & IFCAP_TSO4) {
4004 			if (IFCAP_TSO4 & ifp->if_capenable) {
4005 				ifp->if_capenable &= ~IFCAP_TSO4;
4006 				ifp->if_hwassist &= ~CSUM_TSO;
4007 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4008 				ifp->if_capenable |= IFCAP_TSO4;
4009 				ifp->if_hwassist |= CSUM_TSO;
4010 			} else {
4011 				kprintf("mxge requires tx checksum offload"
4012 				       " be enabled to use TSO\n");
4013 				err = EINVAL;
4014 			}
4015 		}
4016 		if (mask & IFCAP_LRO) {
4017 			if (IFCAP_LRO & ifp->if_capenable)
4018 				err = mxge_change_lro_locked(sc, 0);
4019 			else
4020 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4021 		}
4022 		if (mask & IFCAP_VLAN_HWTAGGING)
4023 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4024 		VLAN_CAPABILITIES(ifp);
4025 
4026 		break;
4027 
4028 	case SIOCGIFMEDIA:
4029 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4030 				    &sc->media, command);
4031 		break;
4032 
4033 	default:
4034 		err = ENOTTY;
4035 	}
4036 	return err;
4037 }
4038 
4039 static void
4040 mxge_fetch_tunables(mxge_softc_t *sc)
4041 {
4042 
4043 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4044 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4045 			  &mxge_flow_control);
4046 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4047 			  &mxge_intr_coal_delay);
4048 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4049 			  &mxge_nvidia_ecrc_enable);
4050 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4051 			  &mxge_force_firmware);
4052 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4053 			  &mxge_deassert_wait);
4054 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4055 			  &mxge_verbose);
4056 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4057 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4058 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4059 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4060 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4061 	if (sc->lro_cnt != 0)
4062 		mxge_lro_cnt = sc->lro_cnt;
4063 
4064 	if (bootverbose)
4065 		mxge_verbose = 1;
4066 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4067 		mxge_intr_coal_delay = 30;
4068 	if (mxge_ticks == 0)
4069 		mxge_ticks = hz / 2;
4070 	sc->pause = mxge_flow_control;
4071 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4072 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4073 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4074 	}
4075 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4076 	    mxge_initial_mtu < ETHER_MIN_LEN)
4077 		mxge_initial_mtu = ETHERMTU_JUMBO;
4078 }
4079 
4081 static void
4082 mxge_free_slices(mxge_softc_t *sc)
4083 {
4084 	struct mxge_slice_state *ss;
4085 	int i;
4086 
4088 	if (sc->ss == NULL)
4089 		return;
4090 
4091 	for (i = 0; i < sc->num_slices; i++) {
4092 		ss = &sc->ss[i];
4093 		if (ss->fw_stats != NULL) {
4094 			mxge_dma_free(&ss->fw_stats_dma);
4095 			ss->fw_stats = NULL;
4096 #ifdef IFNET_BUF_RING
4097 			if (ss->tx.br != NULL) {
4098 				drbr_free(ss->tx.br, M_DEVBUF);
4099 				ss->tx.br = NULL;
4100 			}
4101 #endif
4102 		}
4103 		if (ss->rx_done.entry != NULL) {
4104 			mxge_dma_free(&ss->rx_done.dma);
4105 			ss->rx_done.entry = NULL;
4106 		}
4107 	}
4108 	kfree(sc->ss, M_DEVBUF);
4109 	sc->ss = NULL;
4110 }
4111 
4112 static int
4113 mxge_alloc_slices(mxge_softc_t *sc)
4114 {
4115 	mxge_cmd_t cmd;
4116 	struct mxge_slice_state *ss;
4117 	size_t bytes;
4118 	int err, i, max_intr_slots;
4119 
4120 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4121 	if (err != 0) {
4122 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4123 		return err;
4124 	}
4125 	sc->rx_ring_size = cmd.data0;
4126 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
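	/*
	 * One interrupt queue slot is needed per receive descriptor;
	 * the factor of two leaves room for completions from both the
	 * small- and big-buffer receive rings.
	 */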
4127 
4128 	bytes = sizeof (*sc->ss) * sc->num_slices;
4129 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4130 	if (sc->ss == NULL)
4131 		return (ENOMEM);
4132 	for (i = 0; i < sc->num_slices; i++) {
4133 		ss = &sc->ss[i];
4134 
4135 		ss->sc = sc;
4136 
4137 		/* allocate per-slice rx interrupt queues */
4138 
4139 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4140 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4141 		if (err != 0)
4142 			goto abort;
4143 		ss->rx_done.entry = ss->rx_done.dma.addr;
4144 		bzero(ss->rx_done.entry, bytes);
4145 
4146 		/*
4147 		 * allocate the per-slice firmware stats; stats
4148 		 * (including tx) are used only on the first
4149 		 * slice for now
4150 		 */
4151 #ifndef IFNET_BUF_RING
4152 		if (i > 0)
4153 			continue;
4154 #endif
4155 
4156 		bytes = sizeof (*ss->fw_stats);
4157 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
4159 		if (err != 0)
4160 			goto abort;
4161 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4162 #ifdef IFNET_BUF_RING
4163 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4164 					   &ss->tx.lock);
4165 #endif
4166 	}
4167 
4168 	return (0);
4169 
4170 abort:
4171 	mxge_free_slices(sc);
4172 	return (ENOMEM);
4173 }
4174 
4175 static void
4176 mxge_slice_probe(mxge_softc_t *sc)
4177 {
4178 	mxge_cmd_t cmd;
4179 	char *old_fw;
4180 	int msix_cnt, status, max_intr_slots;
4181 
4182 	sc->num_slices = 1;
4183 	/*
4184 	 *  don't enable multiple slices if they have been disabled
4185 	 *  via the tunable, or if this is not an SMP system
4186 	 */
4187 
4188 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4189 		return;
4190 
4191 	/* see how many MSI-X interrupts are available */
4192 	msix_cnt = pci_msix_count(sc->dev);
4193 	if (msix_cnt < 2)
4194 		return;
4195 
4196 	/* now load the slice-aware firmware and see what it supports */
4197 	old_fw = sc->fw_name;
4198 	if (old_fw == mxge_fw_aligned)
4199 		sc->fw_name = mxge_fw_rss_aligned;
4200 	else
4201 		sc->fw_name = mxge_fw_rss_unaligned;
4202 	status = mxge_load_firmware(sc, 0);
4203 	if (status != 0) {
4204 		device_printf(sc->dev, "Falling back to a single slice\n");
4205 		return;
4206 	}
4207 
4208 	/* try to send a reset command to the card to see if it
4209 	   is alive */
4210 	memset(&cmd, 0, sizeof (cmd));
4211 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4212 	if (status != 0) {
4213 		device_printf(sc->dev, "failed reset\n");
4214 		goto abort_with_fw;
4215 	}
4216 
4217 	/* get rx ring size */
4218 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4219 	if (status != 0) {
4220 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4221 		goto abort_with_fw;
4222 	}
4223 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4224 
4225 	/* tell it the size of the interrupt queues */
4226 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4227 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4228 	if (status != 0) {
4229 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4230 		goto abort_with_fw;
4231 	}
4232 
4233 	/* ask for the maximum number of slices it supports */
4234 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4235 	if (status != 0) {
4236 		device_printf(sc->dev,
4237 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4238 		goto abort_with_fw;
4239 	}
4240 	sc->num_slices = cmd.data0;
4241 	if (sc->num_slices > msix_cnt)
4242 		sc->num_slices = msix_cnt;
4243 
4244 	if (mxge_max_slices == -1) {
4245 		/* cap to number of CPUs in system */
4246 		if (sc->num_slices > ncpus)
4247 			sc->num_slices = ncpus;
4248 	} else {
4249 		if (sc->num_slices > mxge_max_slices)
4250 			sc->num_slices = mxge_max_slices;
4251 	}
4252 	/* make sure it is a power of two */
4253 	while (sc->num_slices & (sc->num_slices - 1))
4254 		sc->num_slices--;
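	/*
	 * (n & (n - 1)) is nonzero while more than one bit is set, so
	 * the loop above rounds num_slices down to the nearest power
	 * of two.
	 */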
4255 
4256 	if (mxge_verbose)
4257 		device_printf(sc->dev, "using %d slices\n",
4258 			      sc->num_slices);
4259 
4260 	return;
4261 
4262 abort_with_fw:
4263 	sc->fw_name = old_fw;
4264 	(void) mxge_load_firmware(sc, 0);
4265 }
4266 
4267 #if 0
4268 static int
4269 mxge_add_msix_irqs(mxge_softc_t *sc)
4270 {
4271 	size_t bytes;
4272 	int count, err, i, rid;
4273 
4274 	rid = PCIR_BAR(2);
4275 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4276 						    &rid, RF_ACTIVE);
4277 
4278 	if (sc->msix_table_res == NULL) {
4279 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4280 		return ENXIO;
4281 	}
4282 
4283 	count = sc->num_slices;
4284 	err = pci_alloc_msix(sc->dev, &count);
4285 	if (err != 0) {
4286 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4287 			      "err = %d\n", sc->num_slices, err);
4288 		goto abort_with_msix_table;
4289 	}
4290 	if (count < sc->num_slices) {
4291 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4292 			      sc->num_slices, count);
4293 		device_printf(sc->dev,
4294 			      "Try setting hw.mxge.max_slices to %d\n",
4295 			      count);
4296 		err = ENOSPC;
4297 		goto abort_with_msix;
4298 	}
4299 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4300 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4301 	if (sc->msix_irq_res == NULL) {
4302 		err = ENOMEM;
4303 		goto abort_with_msix;
4304 	}
4305 
4306 	for (i = 0; i < sc->num_slices; i++) {
4307 		rid = i + 1;
4308 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4309 							  SYS_RES_IRQ,
4310 							  &rid, RF_ACTIVE);
4311 		if (sc->msix_irq_res[i] == NULL) {
4312 			device_printf(sc->dev, "couldn't allocate IRQ res"
4313 				      " for message %d\n", i);
4314 			err = ENXIO;
4315 			goto abort_with_res;
4316 		}
4317 	}
4318 
4319 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4320 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4321 
4322 	for (i = 0; i < sc->num_slices; i++) {
4323 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4324 				     INTR_MPSAFE,
4325 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4326 				     sc->ifp->if_serializer);
4327 		if (err != 0) {
4328 			device_printf(sc->dev, "couldn't setup intr for "
4329 				      "message %d\n", i);
4330 			goto abort_with_intr;
4331 		}
4332 	}
4333 
4334 	if (mxge_verbose) {
4335 		device_printf(sc->dev, "using %d msix IRQs:",
4336 			      sc->num_slices);
4337 		for (i = 0; i < sc->num_slices; i++)
4338 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
4339 		kprintf("\n");
4340 	}
4341 	return (0);
4342 
4343 abort_with_intr:
4344 	for (i = 0; i < sc->num_slices; i++) {
4345 		if (sc->msix_ih[i] != NULL) {
4346 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4347 					  sc->msix_ih[i]);
4348 			sc->msix_ih[i] = NULL;
4349 		}
4350 	}
4351 	kfree(sc->msix_ih, M_DEVBUF);
4352 
4354 abort_with_res:
4355 	for (i = 0; i < sc->num_slices; i++) {
4356 		rid = i + 1;
4357 		if (sc->msix_irq_res[i] != NULL)
4358 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4359 					     sc->msix_irq_res[i]);
4360 		sc->msix_irq_res[i] = NULL;
4361 	}
4362 	kfree(sc->msix_irq_res, M_DEVBUF);
4363 
4365 abort_with_msix:
4366 	pci_release_msi(sc->dev);
4367 
4368 abort_with_msix_table:
4369 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4370 			     sc->msix_table_res);
4371 
4372 	return err;
4373 }
4374 #endif
4375 
4376 static int
4377 mxge_add_single_irq(mxge_softc_t *sc)
4378 {
4379 	int count, err, rid;
4380 
4381 #ifdef OLD_MSI
4382 	count = pci_msi_count(sc->dev);
4383 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4384 		rid = 1;
4385 	} else {
4386 		rid = 0;
4387 		sc->legacy_irq = 1;
4388 	}
4389 #else
4390 	count = 0;
4391 	rid = 0;
4392 	sc->legacy_irq = 1;
4393 #endif
4394 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4395 					 1, RF_SHAREABLE | RF_ACTIVE);
4396 	if (sc->irq_res == NULL) {
4397 		device_printf(sc->dev, "could not alloc interrupt\n");
4398 		return ENXIO;
4399 	}
4400 	if (mxge_verbose)
4401 		device_printf(sc->dev, "using %s irq %ld\n",
4402 			      sc->legacy_irq ? "INTx" : "MSI",
4403 			      rman_get_start(sc->irq_res));
4404 	err = bus_setup_intr(sc->dev, sc->irq_res,
4405 			     INTR_MPSAFE,
4406 			     mxge_intr, &sc->ss[0], &sc->ih,
4407 			     sc->ifp->if_serializer);
4408 	if (err != 0) {
4409 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4410 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4411 		if (!sc->legacy_irq)
4412 			pci_release_msi(sc->dev);
4413 	}
4414 	return err;
4415 }
4416 
4417 #if 0
4418 static void
4419 mxge_rem_msix_irqs(mxge_softc_t *sc)
4420 {
4421 	int i, rid;
4422 
4423 	for (i = 0; i < sc->num_slices; i++) {
4424 		if (sc->msix_ih[i] != NULL) {
4425 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4426 					  sc->msix_ih[i]);
4427 			sc->msix_ih[i] = NULL;
4428 		}
4429 	}
4430 	kfree(sc->msix_ih, M_DEVBUF);
4431 
4432 	for (i = 0; i < sc->num_slices; i++) {
4433 		rid = i + 1;
4434 		if (sc->msix_irq_res[i] != NULL)
4435 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4436 					     sc->msix_irq_res[i]);
4437 		sc->msix_irq_res[i] = NULL;
4438 	}
4439 	kfree(sc->msix_irq_res, M_DEVBUF);
4440 
4441 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4442 			     sc->msix_table_res);
4443 
4444 	pci_release_msi(sc->dev);
4445 	return;
4446 }
4447 #endif
4448 
4449 static void
4450 mxge_rem_single_irq(mxge_softc_t *sc)
4451 {
4452 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4453 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4454 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4455 	if (!sc->legacy_irq)
4456 		pci_release_msi(sc->dev);
4457 }
4458 
4459 static void
4460 mxge_rem_irq(mxge_softc_t *sc)
4461 {
4462 #if 0
4463 	if (sc->num_slices > 1)
4464 		mxge_rem_msix_irqs(sc);
4465 	else
4466 #endif
4467 		mxge_rem_single_irq(sc);
4468 }
4469 
4470 static int
4471 mxge_add_irq(mxge_softc_t *sc)
4472 {
4473 #if 0
4474 	int err;
4475 
4476 	if (sc->num_slices > 1)
4477 		err = mxge_add_msix_irqs(sc);
4478 	else
4479 		err = mxge_add_single_irq(sc);
4480 
4481 	if (0 && err == 0 && sc->num_slices > 1) {
4482 		mxge_rem_msix_irqs(sc);
4483 		err = mxge_add_msix_irqs(sc);
4484 	}
4485 	return err;
4486 #else
4487 	return mxge_add_single_irq(sc);
4488 #endif
4489 }
4490 
4492 static int
4493 mxge_attach(device_t dev)
4494 {
4495 	mxge_softc_t *sc = device_get_softc(dev);
4496 	struct ifnet *ifp = &sc->arpcom.ac_if;
4497 	int err, rid;
4498 
4499 	/*
4500 	 * avoid rewriting half the lines in this file to use
4501 	 * &sc->arpcom.ac_if instead
4502 	 */
4503 	sc->ifp = ifp;
4504 	sc->dev = dev;
4505 	mxge_fetch_tunables(sc);
4506 
4507 	err = bus_dma_tag_create(NULL,			/* parent */
4508 				 1,			/* alignment */
4509 				 0,			/* boundary */
4510 				 BUS_SPACE_MAXADDR,	/* low */
4511 				 BUS_SPACE_MAXADDR,	/* high */
4512 				 NULL, NULL,		/* filter */
4513 				 65536 + 256,		/* maxsize */
4514 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4515 				 65536,			/* maxsegsize */
4516 				 0,			/* flags */
4517 				 &sc->parent_dmat);	/* tag */
4518 
4519 	if (err != 0) {
4520 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4521 			      err);
4522 		goto abort_with_nothing;
4523 	}
4524 
4526 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4527 
4528 	callout_init_mp(&sc->co_hdl);
4529 
4530 	mxge_setup_cfg_space(sc);
4531 
4532 	/* Map the board into the kernel */
4533 	rid = PCIR_BARS;
4534 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4535 					 ~0, 1, RF_ACTIVE);
4536 	if (sc->mem_res == NULL) {
4537 		device_printf(dev, "could not map memory\n");
4538 		err = ENXIO;
4539 		goto abort_with_nothing;
4540 	}
4541 	sc->sram = rman_get_virtual(sc->mem_res);
4542 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
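	/*
	 * That is 2MB of LANai SRAM less 128KB (two 48KB regions plus
	 * one 32KB region) and a final 0x100 bytes, which are assumed
	 * to be reserved for firmware use; the exact memory map is not
	 * spelled out here.
	 */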
4543 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4544 		device_printf(dev, "impossible memory region size %ld\n",
4545 			      rman_get_size(sc->mem_res));
4546 		err = ENXIO;
4547 		goto abort_with_mem_res;
4548 	}
4549 
4550 	/* make a NUL-terminated copy of the EEPROM strings section of
4551 	   LANai SRAM */
4552 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4553 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4554 				rman_get_bushandle(sc->mem_res),
4555 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4556 				sc->eeprom_strings,
4557 				MXGE_EEPROM_STRINGS_SIZE - 2);
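	/*
	 * Reading two bytes fewer than the buffer size, together with
	 * the bzero() above, guarantees the copied strings section is
	 * NUL terminated.
	 */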
4558 	err = mxge_parse_strings(sc);
4559 	if (err != 0)
4560 		goto abort_with_mem_res;
4561 
4562 	/* Enable write combining for efficient use of PCIe bus */
4563 	mxge_enable_wc(sc);
4564 
4565 	/* Allocate the out of band dma memory */
4566 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4567 			     sizeof (mxge_cmd_t), 64);
4568 	if (err != 0)
4569 		goto abort_with_mem_res;
4570 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4571 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4572 	if (err != 0)
4573 		goto abort_with_cmd_dma;
4574 
4575 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4576 	if (err != 0)
4577 		goto abort_with_zeropad_dma;
4578 
4579 	/* select & load the firmware */
4580 	err = mxge_select_firmware(sc);
4581 	if (err != 0)
4582 		goto abort_with_dmabench;
4583 	sc->intr_coal_delay = mxge_intr_coal_delay;
4584 
4585 	mxge_slice_probe(sc);
4586 	err = mxge_alloc_slices(sc);
4587 	if (err != 0)
4588 		goto abort_with_dmabench;
4589 
4590 	err = mxge_reset(sc, 0);
4591 	if (err != 0)
4592 		goto abort_with_slices;
4593 
4594 	err = mxge_alloc_rings(sc);
4595 	if (err != 0) {
4596 		device_printf(sc->dev, "failed to allocate rings\n");
4597 		goto abort_with_slices;
4598 	}
4599 
4600 	ifp->if_baudrate = IF_Gbps(10UL);
4601 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4602 		IFCAP_VLAN_MTU;
4603 #ifdef INET
4604 	ifp->if_capabilities |= IFCAP_LRO;
4605 #endif
4606 
4607 #ifdef MXGE_NEW_VLAN_API
4608 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4609 #endif
4610 
4611 	sc->max_mtu = mxge_max_mtu(sc);
4612 	if (sc->max_mtu >= 9000)
4613 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4614 	else
4615 		device_printf(dev, "MTU limited to %d.  Install "
4616 			      "latest firmware for 9000 byte jumbo support\n",
4617 			      sc->max_mtu - ETHER_HDR_LEN);
4618 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4619 	ifp->if_capenable = ifp->if_capabilities;
4620 	if (sc->lro_cnt == 0)
4621 		ifp->if_capenable &= ~IFCAP_LRO;
4622 	sc->csum_flag = 1;
4623 	ifp->if_init = mxge_init;
4624 	ifp->if_softc = sc;
4625 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4626 	ifp->if_ioctl = mxge_ioctl;
4627 	ifp->if_start = mxge_start;
4628 	/* Initialise the ifmedia structure */
4629 	ifmedia_init(&sc->media, 0, mxge_media_change,
4630 		     mxge_media_status);
4631 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4632 	mxge_media_probe(sc);
4633 	sc->dying = 0;
4634 	ether_ifattach(ifp, sc->mac_addr, NULL);
4635 	/* ether_ifattach sets mtu to ETHERMTU */
4636 	if (mxge_initial_mtu != ETHERMTU) {
4637 		lwkt_serialize_enter(ifp->if_serializer);
4638 		mxge_change_mtu(sc, mxge_initial_mtu);
4639 		lwkt_serialize_exit(ifp->if_serializer);
4640 	}
4641 	/* must come after ether_ifattach() */
4642 	err = mxge_add_irq(sc);
4643 	if (err != 0) {
4644 		device_printf(sc->dev, "failed to add irq\n");
4645 		goto abort_with_rings;
4646 	}
4647 
4648 	mxge_add_sysctls(sc);
4649 #ifdef IFNET_BUF_RING
4650 	ifp->if_transmit = mxge_transmit;
4651 	ifp->if_qflush = mxge_qflush;
4652 #endif
4653 	return 0;
4654 
4655 abort_with_rings:
4656 	mxge_free_rings(sc);
4657 abort_with_slices:
4658 	mxge_free_slices(sc);
4659 abort_with_dmabench:
4660 	mxge_dma_free(&sc->dmabench_dma);
4661 abort_with_zeropad_dma:
4662 	mxge_dma_free(&sc->zeropad_dma);
4663 abort_with_cmd_dma:
4664 	mxge_dma_free(&sc->cmd_dma);
4665 abort_with_mem_res:
4666 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4667 	pci_disable_busmaster(dev);
4668 	bus_dma_tag_destroy(sc->parent_dmat);
4669 abort_with_nothing:
4670 	return err;
4671 }
4672 
4673 static int
4674 mxge_detach(device_t dev)
4675 {
4676 	mxge_softc_t *sc = device_get_softc(dev);
4677 
4678 	lwkt_serialize_enter(sc->ifp->if_serializer);
4679 	sc->dying = 1;
4680 	if (sc->ifp->if_flags & IFF_RUNNING)
4681 		mxge_close(sc);
4682 	/*
4683 	 * XXX: race: the callout callback could be spinning on
4684 	 * the serializer and run anyway
4685 	 */
4686 	callout_stop(&sc->co_hdl);
4687 	lwkt_serialize_exit(sc->ifp->if_serializer);
4688 
4689 	ether_ifdetach(sc->ifp);
4690 	ifmedia_removeall(&sc->media);
4691 	mxge_dummy_rdma(sc, 0);
4692 	mxge_rem_sysctls(sc);
4693 	mxge_rem_irq(sc);
4694 	mxge_free_rings(sc);
4695 	mxge_free_slices(sc);
4696 	mxge_dma_free(&sc->dmabench_dma);
4697 	mxge_dma_free(&sc->zeropad_dma);
4698 	mxge_dma_free(&sc->cmd_dma);
4699 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4700 	pci_disable_busmaster(dev);
4701 	bus_dma_tag_destroy(sc->parent_dmat);
4702 	return 0;
4703 }
4704 
4705 static int
4706 mxge_shutdown(device_t dev)
4707 {
4708 	return 0;
4709 }
4710 
4711 /*
4712   This file uses Myri10GE driver indentation.
4713 
4714   Local Variables:
4715   c-file-style:"linux"
4716   tab-width:8
4717   End:
4718 */
4719