xref: /dragonfly/sys/dev/netif/mxge/if_mxge.c (revision 25a2db75)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 $FreeBSD: src/sys/dev/mxge/if_mxge.c,v 1.63 2009/06/26 11:45:06 rwatson Exp $
29 
30 ***************************************************************************/
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/linker.h>
35 #include <sys/firmware.h>
36 #include <sys/endian.h>
37 #include <sys/in_cksum.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/module.h>
43 #include <sys/serialize.h>
44 #include <sys/socket.h>
45 #include <sys/sysctl.h>
46 
47 /* count xmits ourselves, rather than via drbr */
48 #define NO_SLOW_STATS
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ifq_var.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/vlan/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <sys/bus.h>
68 #include <sys/rman.h>
69 
70 #include <bus/pci/pcireg.h>
71 #include <bus/pci/pcivar.h>
72 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
73 
74 #include <vm/vm.h>		/* for pmap_mapdev() */
75 #include <vm/pmap.h>
76 
77 #if defined(__i386) || defined(__x86_64)
78 #include <machine/specialreg.h>
79 #endif
80 
81 #include <dev/netif/mxge/mxge_mcp.h>
82 #include <dev/netif/mxge/mcp_gen_header.h>
83 /*#define MXGE_FAKE_IFP*/
84 #include <dev/netif/mxge/if_mxge_var.h>
85 #ifdef IFNET_BUF_RING
86 #include <sys/buf_ring.h>
87 #endif
88 
89 #include "opt_inet.h"
90 
91 /* tunable params */
92 static int mxge_nvidia_ecrc_enable = 1;
93 static int mxge_force_firmware = 0;
94 static int mxge_intr_coal_delay = 30;
95 static int mxge_deassert_wait = 1;
96 static int mxge_flow_control = 1;
97 static int mxge_verbose = 0;
98 static int mxge_lro_cnt = 8;
99 static int mxge_ticks;
100 static int mxge_max_slices = 1;
101 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
102 static int mxge_always_promisc = 0;
103 /* XXX: not yet */
104 /* static int mxge_initial_mtu = ETHERMTU_JUMBO; */
105 static int mxge_initial_mtu = ETHERMTU;
106 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
107 static char *mxge_fw_aligned = "mxge_eth_z8e";
108 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
109 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
110 
111 static int mxge_probe(device_t dev);
112 static int mxge_attach(device_t dev);
113 static int mxge_detach(device_t dev);
114 static int mxge_shutdown(device_t dev);
115 static void mxge_intr(void *arg);
116 
117 static device_method_t mxge_methods[] =
118 {
119   /* Device interface */
120   DEVMETHOD(device_probe, mxge_probe),
121   DEVMETHOD(device_attach, mxge_attach),
122   DEVMETHOD(device_detach, mxge_detach),
123   DEVMETHOD(device_shutdown, mxge_shutdown),
124   DEVMETHOD_END
125 };
126 
127 static driver_t mxge_driver =
128 {
129   "mxge",
130   mxge_methods,
131   sizeof(mxge_softc_t),
132 };
133 
134 static devclass_t mxge_devclass;
135 
136 /* Declare ourselves to be a child of the PCI bus.*/
137 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
138 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
139 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
140 
141 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
142 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
143 static int mxge_close(mxge_softc_t *sc);
144 static int mxge_open(mxge_softc_t *sc);
145 static void mxge_tick(void *arg);
146 
147 /* XXX: we don't have Large Receive Offload support yet */
148 inline int
149 mxge_lro_rx(struct mxge_slice_state *ss, struct mbuf *m_head, uint32_t csum)
150 {
151 	(void)ss;
152 	(void)m_head;
153 	(void)csum;
154 	return 1;
155 }
156 
157 inline void
158 mxge_lro_flush(struct mxge_slice_state *ss, struct lro_entry *lro)
159 {
160 	(void)ss;
161 	(void)lro;
162 }
163 
164 static int
165 mxge_probe(device_t dev)
166 {
167 	int rev;
168 
169 
170 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
171 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
172 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
173 		rev = pci_get_revid(dev);
174 		switch (rev) {
175 		case MXGE_PCI_REV_Z8E:
176 			device_set_desc(dev, "Myri10G-PCIE-8A");
177 			break;
178 		case MXGE_PCI_REV_Z8ES:
179 			device_set_desc(dev, "Myri10G-PCIE-8B");
180 			break;
181 		default:
182 			device_set_desc(dev, "Myri10G-PCIE-8??");
183 			device_printf(dev, "Unrecognized rev %d NIC\n",
184 				      rev);
185 			break;
186 		}
187 		return 0;
188 	}
189 	return ENXIO;
190 }
191 
192 static void
193 mxge_enable_wc(mxge_softc_t *sc)
194 {
195 #if 0
196 #if defined(__i386) || defined(__x86_64)
197 	vm_offset_t len;
198 	int err;
199 
200 	sc->wc = 1;
201 	len = rman_get_size(sc->mem_res);
202 	err = pmap_change_attr((vm_offset_t) sc->sram,
203 			       len, PAT_WRITE_COMBINING);
204 	if (err != 0) {
205 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
206 			      err);
207 		sc->wc = 0;
208 	}
209 #endif
210 #else
211 	sc->wc = 0;	/* TBD: PAT support */
212 #endif
213 }
214 
215 
216 /* callback to get our DMA address */
217 static void
218 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
219 			 int error)
220 {
221 	if (error == 0) {
222 		*(bus_addr_t *) arg = segs->ds_addr;
223 	}
224 }
225 
226 static int
227 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
228 		   bus_size_t alignment)
229 {
230 	int err;
231 	device_t dev = sc->dev;
232 	bus_size_t boundary, maxsegsize;
233 
234 	if (bytes > 4096 && alignment == 4096) {
235 		boundary = 0;
236 		maxsegsize = bytes;
237 	} else {
238 		boundary = 4096;
239 		maxsegsize = 4096;
240 	}
241 
242 	/* allocate DMAable memory tags */
243 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
244 				 alignment,		/* alignment */
245 				 boundary,		/* boundary */
246 				 BUS_SPACE_MAXADDR,	/* low */
247 				 BUS_SPACE_MAXADDR,	/* high */
248 				 NULL, NULL,		/* filter */
249 				 bytes,			/* maxsize */
250 				 1,			/* num segs */
251 				 maxsegsize,		/* maxsegsize */
252 				 BUS_DMA_COHERENT,	/* flags */
253 				 &dma->dmat);		/* tag */
254 	if (err != 0) {
255 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
256 		return err;
257 	}
258 
259 	/* allocate DMAable memory & map */
260 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
261 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
262 				| BUS_DMA_ZERO),  &dma->map);
263 	if (err != 0) {
264 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
265 		goto abort_with_dmat;
266 	}
267 
268 	/* load the memory */
269 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
270 			      mxge_dmamap_callback,
271 			      (void *)&dma->bus_addr, 0);
272 	if (err != 0) {
273 		device_printf(dev, "couldn't load map (err = %d)\n", err);
274 		goto abort_with_mem;
275 	}
276 	return 0;
277 
278 abort_with_mem:
279 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280 abort_with_dmat:
281 	(void)bus_dma_tag_destroy(dma->dmat);
282 	return err;
283 }
284 
285 
286 static void
287 mxge_dma_free(mxge_dma_t *dma)
288 {
289 	bus_dmamap_unload(dma->dmat, dma->map);
290 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
291 	(void)bus_dma_tag_destroy(dma->dmat);
292 }
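/*
 * Illustrative usage sketch (a hypothetical call, not copied from the
 * attach path): allocate a small 64-byte-aligned, DMA-visible block,
 * hand its bus address to the NIC, and release it on teardown:
 *
 *	if (mxge_dma_alloc(sc, &sc->cmd_dma, 4096, 64) == 0) {
 *		sc->cmd = sc->cmd_dma.addr;
 *		...
 *		mxge_dma_free(&sc->cmd_dma);
 *	}
 */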
293 
294 /*
295  * The eeprom strings on the lanaiX have the format
296  * SN=x\0
297  * MAC=x:x:x:x:x:x\0
298  * PC=text\0
299  */
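/*
 * For example (hypothetical contents), the raw block might be:
 *
 *	"SN=123456\0MAC=00:60:dd:47:87:2f\0PC=10G-PCIE-8A-C\0\0"
 *
 * In the parser below, the initial "ptr += 1" plus the per-byte
 * "ptr += 3" steps from "MAC=" onto each two-digit hex group
 * ("00", "60", ...), which strtoul() converts with base 16.
 */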
300 
301 static int
302 mxge_parse_strings(mxge_softc_t *sc)
303 {
304 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
305 
306 	char *ptr, *limit;
307 	int i, found_mac;
308 
309 	ptr = sc->eeprom_strings;
310 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
311 	found_mac = 0;
312 	while (ptr < limit && *ptr != '\0') {
313 		if (memcmp(ptr, "MAC=", 4) == 0) {
314 			ptr += 1;
315 			sc->mac_addr_string = ptr;
316 			for (i = 0; i < 6; i++) {
317 				ptr += 3;
318 				if ((ptr + 2) > limit)
319 					goto abort;
320 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
321 				found_mac = 1;
322 			}
323 		} else if (memcmp(ptr, "PC=", 3) == 0) {
324 			ptr += 3;
325 			strncpy(sc->product_code_string, ptr,
326 				sizeof (sc->product_code_string) - 1);
327 		} else if (memcmp(ptr, "SN=", 3) == 0) {
328 			ptr += 3;
329 			strncpy(sc->serial_number_string, ptr,
330 				sizeof (sc->serial_number_string) - 1);
331 		}
332 		MXGE_NEXT_STRING(ptr);
333 	}
334 
335 	if (found_mac)
336 		return 0;
337 
338  abort:
339 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
340 
341 	return ENXIO;
342 }
343 
344 #if defined(__i386) || defined(i386) || defined(__i386__) || defined(__x86_64__)
345 static void
346 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
347 {
348 	uint32_t val;
349 	unsigned long base, off;
350 	char *va, *cfgptr;
351 	device_t pdev, mcp55;
352 	uint16_t vendor_id, device_id, word;
353 	uintptr_t bus, slot, func, ivend, idev;
354 	uint32_t *ptr32;
355 
356 
357 	if (!mxge_nvidia_ecrc_enable)
358 		return;
359 
360 	pdev = device_get_parent(device_get_parent(sc->dev));
361 	if (pdev == NULL) {
362 		device_printf(sc->dev, "could not find parent?\n");
363 		return;
364 	}
365 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
366 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
367 
368 	if (vendor_id != 0x10de)
369 		return;
370 
371 	base = 0;
372 
373 	if (device_id == 0x005d) {
374 		/* ck804, base address is magic */
375 		base = 0xe0000000UL;
376 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
377 		/* mcp55, base address stored in chipset */
378 		mcp55 = pci_find_bsf(0, 0, 0);
379 		if (mcp55 &&
380 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
381 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
382 			word = pci_read_config(mcp55, 0x90, 2);
383 			base = ((unsigned long)word & 0x7ffeU) << 25;
384 		}
385 	}
386 	if (!base)
387 		return;
388 
389 	/* XXXX
390 	   Test below is commented because it is believed that doing
391 	   config read/write beyond 0xff will access the config space
392 	   for the next larger function.  Uncomment this and remove
393 	   the hacky pmap_mapdev() way of accessing config space when
394 	   FreeBSD grows support for extended pcie config space access
395 	*/
396 #if 0
397 	/* See if we can, by some miracle, access the extended
398 	   config space */
399 	val = pci_read_config(pdev, 0x178, 4);
400 	if (val != 0xffffffff) {
401 		val |= 0x40;
402 		pci_write_config(pdev, 0x178, val, 4);
403 		return;
404 	}
405 #endif
406 	/* Rather than using normal pci config space writes, we must
407 	 * map the Nvidia config space ourselves.  This is because on
408 	 * opteron/nvidia class machines the 0xe0000000 mapping is
409 	 * handled by the nvidia chipset; that means the internal PCI
410 	 * device (the on-chip northbridge), or the amd-8131 bridge
411 	 * and things behind them, are not visible via this method.
412 	 */
413 
414 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 		      PCI_IVAR_BUS, &bus);
416 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 		      PCI_IVAR_SLOT, &slot);
418 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
419 		      PCI_IVAR_FUNCTION, &func);
420 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
421 		      PCI_IVAR_VENDOR, &ivend);
422 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
423 		      PCI_IVAR_DEVICE, &idev);
424 
425 	off =  base
426 		+ 0x00100000UL * (unsigned long)bus
427 		+ 0x00001000UL * (unsigned long)(func
428 						 + 8 * slot);
429 
430 	/* map it into the kernel */
431 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
432 
433 
434 	if (va == NULL) {
435 		device_printf(sc->dev, "pmap_mapdev() failed\n");
436 		return;
437 	}
438 	/* get a pointer to the config space mapped into the kernel */
439 	cfgptr = va + (off & PAGE_MASK);
440 
441 	/* make sure that we can really access it */
442 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
443 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
444 	if (! (vendor_id == ivend && device_id == idev)) {
445 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
446 			      vendor_id, device_id);
447 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
448 		return;
449 	}
450 
451 	ptr32 = (uint32_t*)(cfgptr + 0x178);
452 	val = *ptr32;
453 
454 	if (val == 0xffffffff) {
455 		device_printf(sc->dev, "extended mapping failed\n");
456 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457 		return;
458 	}
459 	*ptr32 = val | 0x40;
460 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
461 	if (mxge_verbose)
462 		device_printf(sc->dev,
463 			      "Enabled ECRC on upstream Nvidia bridge "
464 			      "at %d:%d:%d\n",
465 			      (int)bus, (int)slot, (int)func);
466 	return;
467 }
468 #else
469 static void
470 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
471 {
472 	device_printf(sc->dev,
473 		      "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
474 	return;
475 }
476 #endif
477 
478 
479 static int
480 mxge_dma_test(mxge_softc_t *sc, int test_type)
481 {
482 	mxge_cmd_t cmd;
483 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
484 	int status;
485 	uint32_t len;
486 	char *test = " ";
487 
488 
489 	/* Run a small DMA test.
490 	 * The magic multipliers to the length tell the firmware
491 	 * to do DMA read, write, or read+write tests.  The
492 	 * results are returned in cmd.data0.  The upper 16
493 	 * bits of the return is the number of transfers completed.
494 	 * The lower 16 bits is the time in 0.5us ticks that the
495 	 * transfers took to complete.
496 	 */
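	/*
	 * Worked example with made-up numbers: if len is 2048 and the
	 * read test returns cmd.data0 == (100 << 16) | 160, then 100
	 * transfers completed in 160 half-microsecond ticks (80us), so
	 *
	 *	read_dma = (100 * 2048 * 2) / 160 = 2560 MB/s
	 *
	 * (the "* 2" converts 0.5us ticks to bytes per microsecond,
	 * i.e. MB/s).
	 */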
497 
498 	len = sc->tx_boundary;
499 
500 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
501 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
502 	cmd.data2 = len * 0x10000;
503 	status = mxge_send_cmd(sc, test_type, &cmd);
504 	if (status != 0) {
505 		test = "read";
506 		goto abort;
507 	}
508 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
509 		(cmd.data0 & 0xffff);
510 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
511 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
512 	cmd.data2 = len * 0x1;
513 	status = mxge_send_cmd(sc, test_type, &cmd);
514 	if (status != 0) {
515 		test = "write";
516 		goto abort;
517 	}
518 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
519 		(cmd.data0 & 0xffff);
520 
521 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
522 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
523 	cmd.data2 = len * 0x10001;
524 	status = mxge_send_cmd(sc, test_type, &cmd);
525 	if (status != 0) {
526 		test = "read/write";
527 		goto abort;
528 	}
529 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
530 		(cmd.data0 & 0xffff);
531 
532 abort:
533 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
534 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
535 			      test, status);
536 
537 	return status;
538 }
539 
540 /*
541  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
542  * when the PCI-E Completion packets are aligned on an 8-byte
543  * boundary.  Some PCI-E chip sets always align Completion packets; on
544  * the ones that do not, the alignment can be enforced by enabling
545  * ECRC generation (if supported).
546  *
547  * When PCI-E Completion packets are not aligned, it is actually more
548  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
549  *
550  * If the driver can neither enable ECRC nor verify that it has
551  * already been enabled, then it must use a firmware image which works
552  * around unaligned completion packets (ethp_z8e.dat), and it should
553  * also ensure that it never gives the device a Read-DMA which is
554  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
555  * enabled, then the driver should use the aligned (eth_z8e.dat)
556  * firmware image, and set tx_boundary to 4KB.
557  */
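/*
 * In short, the decision made by mxge_select_firmware() below is:
 *
 *	completions aligned (or ECRC enabled)	-> eth_z8e,  tx_boundary 4096
 *	completions possibly unaligned		-> ethp_z8e, tx_boundary 2048
 */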
558 
559 static int
560 mxge_firmware_probe(mxge_softc_t *sc)
561 {
562 	device_t dev = sc->dev;
563 	int reg, status;
564 	uint16_t pectl;
565 
566 	sc->tx_boundary = 4096;
567 	/*
568 	 * Verify the max read request size was set to 4KB
569 	 * before trying the test with 4KB.
570 	 */
571 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
572 		pectl = pci_read_config(dev, reg + 0x8, 2);
573 		if ((pectl & (5 << 12)) != (5 << 12)) {
574 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
575 				      pectl);
576 			sc->tx_boundary = 2048;
577 		}
578 	}
579 
580 	/*
581 	 * load the optimized firmware (which assumes aligned PCIe
582 	 * completions) in order to see if it works on this host.
583 	 */
584 	sc->fw_name = mxge_fw_aligned;
585 	status = mxge_load_firmware(sc, 1);
586 	if (status != 0) {
587 		return status;
588 	}
589 
590 	/*
591 	 * Enable ECRC if possible
592 	 */
593 	mxge_enable_nvidia_ecrc(sc);
594 
595 	/*
596 	 * Run a DMA test which watches for unaligned completions and
597 	 * aborts on the first one seen.
598 	 */
599 
600 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
601 	if (status == 0)
602 		return 0; /* keep the aligned firmware */
603 
604 	if (status != E2BIG)
605 		device_printf(dev, "DMA test failed: %d\n", status);
606 	if (status == ENOSYS)
607 		device_printf(dev, "Falling back to ethp! "
608 			      "Please install up to date fw\n");
609 	return status;
610 }
611 
612 static int
613 mxge_select_firmware(mxge_softc_t *sc)
614 {
615 	int aligned = 0;
616 
617 
618 	if (mxge_force_firmware != 0) {
619 		if (mxge_force_firmware == 1)
620 			aligned = 1;
621 		else
622 			aligned = 0;
623 		if (mxge_verbose)
624 			device_printf(sc->dev,
625 				      "Assuming %s completions (forced)\n",
626 				      aligned ? "aligned" : "unaligned");
627 		goto abort;
628 	}
629 
630 	/* if the PCIe link width is 4 or less, we can use the aligned
631 	   firmware and skip any checks */
632 	if (sc->link_width != 0 && sc->link_width <= 4) {
633 		device_printf(sc->dev,
634 			      "PCIe x%d Link, expect reduced performance\n",
635 			      sc->link_width);
636 		aligned = 1;
637 		goto abort;
638 	}
639 
640 	if (0 == mxge_firmware_probe(sc))
641 		return 0;
642 
643 abort:
644 	if (aligned) {
645 		sc->fw_name = mxge_fw_aligned;
646 		sc->tx_boundary = 4096;
647 	} else {
648 		sc->fw_name = mxge_fw_unaligned;
649 		sc->tx_boundary = 2048;
650 	}
651 	return (mxge_load_firmware(sc, 0));
652 }
653 
654 union qualhack
655 {
656         const char *ro_char;
657         char *rw_char;
658 };
659 
660 static int
661 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
662 {
663 
664 
665 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
666 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
667 			      be32toh(hdr->mcp_type));
668 		return EIO;
669 	}
670 
671 	/* save firmware version for sysctl */
672 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
673 	if (mxge_verbose)
674 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
675 
676 	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
677 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
678 
679 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
680 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
681 		device_printf(sc->dev, "Found firmware version %s\n",
682 			      sc->fw_version);
683 		device_printf(sc->dev, "Driver needs %d.%d\n",
684 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
685 		return EINVAL;
686 	}
687 	return 0;
688 
689 }
690 
691 static void *
692 z_alloc(void *nil, u_int items, u_int size)
693 {
694         void *ptr;
695 
696         ptr = kmalloc(items * size, M_TEMP, M_NOWAIT);
697         return ptr;
698 }
699 
700 static void
701 z_free(void *nil, void *ptr)
702 {
703         kfree(ptr, M_TEMP);
704 }
705 
706 
707 static int
708 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
709 {
710 	z_stream zs;
711 	char *inflate_buffer;
712 	const struct firmware *fw;
713 	const mcp_gen_header_t *hdr;
714 	unsigned hdr_offset;
715 	int status;
716 	unsigned int i;
717 	size_t fw_len;
718 
719 	fw = firmware_get(sc->fw_name);
720 	if (fw == NULL) {
721 		device_printf(sc->dev, "Could not find firmware image %s\n",
722 			      sc->fw_name);
723 		return ENOENT;
724 	}
725 
726 
727 
728 	/* setup zlib and decompress f/w */
729 	bzero(&zs, sizeof (zs));
730 	zs.zalloc = z_alloc;
731 	zs.zfree = z_free;
732 	status = inflateInit(&zs);
733 	if (status != Z_OK) {
734 		status = EIO;
735 		goto abort_with_fw;
736 	}
737 
738 	/* the uncompressed size is stored as the firmware version,
739 	   which would otherwise go unused */
740 	fw_len = (size_t) fw->version;
741 	inflate_buffer = kmalloc(fw_len, M_TEMP, M_NOWAIT);
742 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* otherwise status is still Z_OK (0) */
		goto abort_with_zs;
	}
744 	zs.avail_in = fw->datasize;
745 	zs.next_in = __DECONST(char *, fw->data);
746 	zs.avail_out = fw_len;
747 	zs.next_out = inflate_buffer;
748 	status = inflate(&zs, Z_FINISH);
749 	if (status != Z_STREAM_END) {
750 		device_printf(sc->dev, "zlib %d\n", status);
751 		status = EIO;
752 		goto abort_with_buffer;
753 	}
754 
755 	/* check id */
756 	hdr_offset = htobe32(*(const uint32_t *)
757 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
758 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
759 		device_printf(sc->dev, "Bad firmware file\n");
760 		status = EIO;
761 		goto abort_with_buffer;
762 	}
763 	hdr = (const void*)(inflate_buffer + hdr_offset);
764 
765 	status = mxge_validate_firmware(sc, hdr);
766 	if (status != 0)
767 		goto abort_with_buffer;
768 
769 	/* Copy the inflated firmware to NIC SRAM. */
770 	for (i = 0; i < fw_len; i += 256) {
771 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
772 			      inflate_buffer + i,
773 			      min(256U, (unsigned)(fw_len - i)));
774 		wmb();
775 		wmb();
776 	}
777 
778 	*limit = fw_len;
779 	status = 0;
780 abort_with_buffer:
781 	kfree(inflate_buffer, M_TEMP);
782 abort_with_zs:
783 	inflateEnd(&zs);
784 abort_with_fw:
785 	firmware_put(fw, FIRMWARE_UNLOAD);
786 	return status;
787 }
788 
789 /*
790  * Enable or disable periodic RDMAs from the host to make certain
791  * chipsets resend dropped PCIe messages
792  */
793 
794 static void
795 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
796 {
797 	char buf_bytes[72];
798 	volatile uint32_t *confirm;
799 	volatile char *submit;
800 	uint32_t *buf, dma_low, dma_high;
801 	int i;
802 
803 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
804 
805 	/* clear confirmation addr */
806 	confirm = (volatile uint32_t *)sc->cmd;
807 	*confirm = 0;
808 	wmb();
809 
810 	/* send an rdma command to the PCIe engine, and wait for the
811 	   response in the confirmation address.  The firmware should
812 	   write a -1 there to indicate it is alive and well
813 	*/
814 
815 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
816 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
817 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
818 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
819 	buf[2] = htobe32(0xffffffff);		/* confirm data */
820 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
821 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
822 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
823 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
824 	buf[5] = htobe32(enable);			/* enable? */
825 
826 
827 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
828 
829 	mxge_pio_copy(submit, buf, 64);
830 	wmb();
831 	DELAY(1000);
832 	wmb();
833 	i = 0;
834 	while (*confirm != 0xffffffff && i < 20) {
835 		DELAY(1000);
836 		i++;
837 	}
838 	if (*confirm != 0xffffffff) {
839 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
840 			      (enable ? "enable" : "disable"), confirm,
841 			      *confirm);
842 	}
843 	return;
844 }
845 
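/*
 * Issue a single command to the firmware.  The protocol, as implemented
 * below: build an 8-byte-aligned mcp_cmd_t holding the command number,
 * three data words and the bus address of the host response buffer,
 * PIO-copy it to the MXGEFW_ETH_CMD window in NIC SRAM, then poll the
 * response buffer until the firmware DMAs back a result code (or 20ms
 * pass).  A typical call, e.g. from mxge_reset():
 *
 *	cmd.data0 = sc->rx_ring_size;
 *	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 */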
846 static int
847 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
848 {
849 	mcp_cmd_t *buf;
850 	char buf_bytes[sizeof(*buf) + 8];
851 	volatile mcp_cmd_response_t *response = sc->cmd;
852 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
853 	uint32_t dma_low, dma_high;
854 	int err, sleep_total = 0;
855 
856 	/*
857 	 * We may be called during attach, before if_serializer is available.
858 	 * This is not a fast path, just check for NULL
859 	 */
860 
861 	if (sc->ifp->if_serializer)
862 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
863 
864 	/* ensure buf is aligned to 8 bytes */
865 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
866 
867 	buf->data0 = htobe32(data->data0);
868 	buf->data1 = htobe32(data->data1);
869 	buf->data2 = htobe32(data->data2);
870 	buf->cmd = htobe32(cmd);
871 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
872 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
873 
874 	buf->response_addr.low = htobe32(dma_low);
875 	buf->response_addr.high = htobe32(dma_high);
876 
877 
878 	response->result = 0xffffffff;
879 	wmb();
880 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
881 
882 	/* wait up to 20ms */
883 	err = EAGAIN;
884 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
885 		bus_dmamap_sync(sc->cmd_dma.dmat,
886 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
887 		wmb();
888 		switch (be32toh(response->result)) {
889 		case 0:
890 			data->data0 = be32toh(response->data);
891 			err = 0;
892 			break;
893 		case 0xffffffff:
894 			DELAY(1000);
895 			break;
896 		case MXGEFW_CMD_UNKNOWN:
897 			err = ENOSYS;
898 			break;
899 		case MXGEFW_CMD_ERROR_UNALIGNED:
900 			err = E2BIG;
901 			break;
902 		case MXGEFW_CMD_ERROR_BUSY:
903 			err = EBUSY;
904 			break;
905 		default:
906 			device_printf(sc->dev,
907 				      "mxge: command %d "
908 				      "failed, result = %d\n",
909 				      cmd, be32toh(response->result));
910 			err = ENXIO;
911 			break;
912 		}
913 		if (err != EAGAIN)
914 			break;
915 	}
916 	if (err == EAGAIN)
917 		device_printf(sc->dev, "mxge: command %d timed out, "
918 			      "result = %d\n",
919 			      cmd, be32toh(response->result));
920 	return err;
921 }
922 
923 static int
924 mxge_adopt_running_firmware(mxge_softc_t *sc)
925 {
926 	struct mcp_gen_header *hdr;
927 	const size_t bytes = sizeof (struct mcp_gen_header);
928 	size_t hdr_offset;
929 	int status;
930 
931 	/* find running firmware header */
932 	hdr_offset = htobe32(*(volatile uint32_t *)
933 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
934 
935 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
936 		device_printf(sc->dev,
937 			      "Running firmware has bad header offset (%d)\n",
938 			      (int)hdr_offset);
939 		return EIO;
940 	}
941 
942 	/* copy header of running firmware from SRAM to host memory to
943 	 * validate firmware */
944 	hdr = kmalloc(bytes, M_DEVBUF, M_NOWAIT);
945 	if (hdr == NULL) {
946 		device_printf(sc->dev, "could not kmalloc firmware hdr\n");
947 		return ENOMEM;
948 	}
949 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
950 				rman_get_bushandle(sc->mem_res),
951 				hdr_offset, (char *)hdr, bytes);
952 	status = mxge_validate_firmware(sc, hdr);
953 	kfree(hdr, M_DEVBUF);
954 
955 	/*
956 	 * check to see if adopted firmware has bug where adopting
957 	 * it will cause broadcasts to be filtered unless the NIC
958 	 * is kept in ALLMULTI mode
959 	 */
960 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
961 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
962 		sc->adopted_rx_filter_bug = 1;
963 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
964 			      "working around rx filter bug\n",
965 			      sc->fw_ver_major, sc->fw_ver_minor,
966 			      sc->fw_ver_tiny);
967 	}
968 
969 	return status;
970 }
971 
972 
973 static int
974 mxge_load_firmware(mxge_softc_t *sc, int adopt)
975 {
976 	volatile uint32_t *confirm;
977 	volatile char *submit;
978 	char buf_bytes[72];
979 	uint32_t *buf, size, dma_low, dma_high;
980 	int status, i;
981 
982 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
983 
984 	size = sc->sram_size;
985 	status = mxge_load_firmware_helper(sc, &size);
986 	if (status) {
987 		if (!adopt)
988 			return status;
989 		/* Try to use the currently running firmware, if
990 		   it is new enough */
991 		status = mxge_adopt_running_firmware(sc);
992 		if (status) {
993 			device_printf(sc->dev,
994 				      "failed to adopt running firmware\n");
995 			return status;
996 		}
997 		device_printf(sc->dev,
998 			      "Successfully adopted running firmware\n");
999 		if (sc->tx_boundary == 4096) {
1000 			device_printf(sc->dev,
1001 				"Using firmware currently running on NIC"
1002 				 ".  For optimal\n");
1003 			device_printf(sc->dev,
1004 				 "performance consider loading optimized "
1005 				 "firmware\n");
1006 		}
1007 		sc->fw_name = mxge_fw_unaligned;
1008 		sc->tx_boundary = 2048;
1009 		return 0;
1010 	}
1011 	/* clear confirmation addr */
1012 	confirm = (volatile uint32_t *)sc->cmd;
1013 	*confirm = 0;
1014 	wmb();
1015 	/* send a reload command to the bootstrap MCP, and wait for the
1016 	   response in the confirmation address.  The firmware should
1017 	   write a -1 there to indicate it is alive and well
1018 	*/
1019 
1020 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1021 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1022 
1023 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1024 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1025 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1026 
1027 	/* FIX: All newest firmware should un-protect the bottom of
1028 	   the sram before handoff. However, the very first interfaces
1029 	   do not. Therefore the handoff copy must skip the first 8 bytes
1030 	*/
1031 					/* where the code starts*/
1032 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1033 	buf[4] = htobe32(size - 8); 	/* length of code */
1034 	buf[5] = htobe32(8);		/* where to copy to */
1035 	buf[6] = htobe32(0);		/* where to jump to */
1036 
1037 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1038 	mxge_pio_copy(submit, buf, 64);
1039 	wmb();
1040 	DELAY(1000);
1041 	wmb();
1042 	i = 0;
1043 	while (*confirm != 0xffffffff && i < 20) {
1044 		DELAY(1000*10);
1045 		i++;
1046 		bus_dmamap_sync(sc->cmd_dma.dmat,
1047 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1048 	}
1049 	if (*confirm != 0xffffffff) {
1050 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1051 			confirm, *confirm);
1052 
1053 		return ENXIO;
1054 	}
1055 	return 0;
1056 }
1057 
1058 static int
1059 mxge_update_mac_address(mxge_softc_t *sc)
1060 {
1061 	mxge_cmd_t cmd;
1062 	uint8_t *addr = sc->mac_addr;
1063 	int status;
1064 
1065 
1066 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1067 		     | (addr[2] << 8) | addr[3]);
1068 
1069 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1070 
1071 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1072 	return status;
1073 }
1074 
1075 static int
1076 mxge_change_pause(mxge_softc_t *sc, int pause)
1077 {
1078 	mxge_cmd_t cmd;
1079 	int status;
1080 
1081 	if (pause)
1082 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1083 				       &cmd);
1084 	else
1085 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1086 				       &cmd);
1087 
1088 	if (status) {
1089 		device_printf(sc->dev, "Failed to set flow control mode\n");
1090 		return ENXIO;
1091 	}
1092 	sc->pause = pause;
1093 	return 0;
1094 }
1095 
1096 static void
1097 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1098 {
1099 	mxge_cmd_t cmd;
1100 	int status;
1101 
1102 	if (sc->ifp->if_serializer)
1103 		ASSERT_SERIALIZED(sc->ifp->if_serializer);
1104 	if (mxge_always_promisc)
1105 		promisc = 1;
1106 
1107 	if (promisc)
1108 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1109 				       &cmd);
1110 	else
1111 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1112 				       &cmd);
1113 
1114 	if (status) {
1115 		device_printf(sc->dev, "Failed to set promisc mode\n");
1116 	}
1117 }
1118 
1119 static void
1120 mxge_set_multicast_list(mxge_softc_t *sc)
1121 {
1122 	mxge_cmd_t cmd;
1123 	struct ifmultiaddr *ifma;
1124 	struct ifnet *ifp = sc->ifp;
1125 	int err;
1126 
1127 	if (ifp->if_serializer)
1128 		ASSERT_SERIALIZED(ifp->if_serializer);
1129 
1130 	/* This firmware is known to not support multicast */
1131 	if (!sc->fw_multicast_support)
1132 		return;
1133 
1134 	/* Disable multicast filtering while we play with the lists */
1135 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1136 	if (err != 0) {
1137 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1138 		       " error status: %d\n", err);
1139 		return;
1140 	}
1141 
1142 	if (sc->adopted_rx_filter_bug)
1143 		return;
1144 
1145 	if (ifp->if_flags & IFF_ALLMULTI)
1146 		/* request to disable multicast filtering, so quit here */
1147 		return;
1148 
1149 	/* Flush all the filters */
1150 
1151 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1152 	if (err != 0) {
1153 		device_printf(sc->dev,
1154 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1155 			      ", error status: %d\n", err);
1156 		return;
1157 	}
1158 
1159 	/* Walk the multicast list, and add each address */
1160 
1161 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1162 		if (ifma->ifma_addr->sa_family != AF_LINK)
1163 			continue;
1164 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1165 		      &cmd.data0, 4);
1166 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1167 		      &cmd.data1, 2);
1168 		cmd.data0 = htonl(cmd.data0);
1169 		cmd.data1 = htonl(cmd.data1);
1170 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1171 		if (err != 0) {
1172 			device_printf(sc->dev, "Failed "
1173 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1174 			       "%d\t", err);
1175 			/* abort, leaving multicast filtering off */
1176 			return;
1177 		}
1178 	}
1179 	/* Enable multicast filtering */
1180 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1181 	if (err != 0) {
1182 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1183 		       ", error status: %d\n", err);
1184 	}
1185 }
1186 
1187 static int
1188 mxge_max_mtu(mxge_softc_t *sc)
1189 {
1190 	mxge_cmd_t cmd;
1191 	int status;
1192 
1193 	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1194 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1195 
1196 	/* try to set nbufs to see if we can
1197 	   use virtually contiguous jumbos */
1198 	cmd.data0 = 0;
1199 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1200 			       &cmd);
1201 	if (status == 0)
1202 		return MXGEFW_MAX_MTU - MXGEFW_PAD;
1203 
1204 	/* otherwise, we're limited to MJUMPAGESIZE */
1205 	return MJUMPAGESIZE - MXGEFW_PAD;
1206 }
1207 
1208 static int
1209 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1210 {
1211 	struct mxge_slice_state *ss;
1212 	mxge_rx_done_t *rx_done;
1213 	volatile uint32_t *irq_claim;
1214 	mxge_cmd_t cmd;
1215 	int slice, status;
1216 
1217 	/* try to send a reset command to the card to see if it
1218 	   is alive */
1219 	memset(&cmd, 0, sizeof (cmd));
1220 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1221 	if (status != 0) {
1222 		device_printf(sc->dev, "failed reset\n");
1223 		return ENXIO;
1224 	}
1225 
1226 	mxge_dummy_rdma(sc, 1);
1227 
1228 
1229 	/* set the intrq size */
1230 	cmd.data0 = sc->rx_ring_size;
1231 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1232 
1233 	/*
1234 	 * Even though we already know how many slices are supported
1235 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1236 	 * has magic side effects, and must be called after a reset.
1237 	 * It must be called prior to calling any RSS related cmds,
1238 	 * including assigning an interrupt queue for anything but
1239 	 * slice 0.  It must also be called *after*
1240 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1241 	 * the firmware to compute offsets.
1242 	 */
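	/*
	 * The resulting command order used here is:
	 *   1. MXGEFW_CMD_RESET
	 *   2. MXGEFW_CMD_SET_INTRQ_SIZE
	 *   3. MXGEFW_CMD_GET_MAX_RSS_QUEUES (answer already known)
	 *   4. MXGEFW_CMD_ENABLE_RSS_QUEUES
	 *   5. MXGEFW_CMD_SET_INTRQ_DMA, once per slice
	 */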
1243 
1244 	if (sc->num_slices > 1) {
1245 		/* ask the maximum number of slices it supports */
1246 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1247 					   &cmd);
1248 		if (status != 0) {
1249 			device_printf(sc->dev,
1250 				      "failed to get number of slices\n");
1251 			return status;
1252 		}
1253 		/*
1254 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1255 		 * to setting up the interrupt queue DMA
1256 		 */
1257 		cmd.data0 = sc->num_slices;
1258 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1259 #ifdef IFNET_BUF_RING
1260 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1261 #endif
1262 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1263 					   &cmd);
1264 		if (status != 0) {
1265 			device_printf(sc->dev,
1266 				      "failed to set number of slices\n");
1267 			return status;
1268 		}
1269 	}
1270 
1271 
1272 	if (interrupts_setup) {
1273 		/* Now exchange information about interrupts  */
1274 		for (slice = 0; slice < sc->num_slices; slice++) {
1275 			rx_done = &sc->ss[slice].rx_done;
1276 			memset(rx_done->entry, 0, sc->rx_ring_size);
1277 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1278 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1279 			cmd.data2 = slice;
1280 			status |= mxge_send_cmd(sc,
1281 						MXGEFW_CMD_SET_INTRQ_DMA,
1282 						&cmd);
1283 		}
1284 	}
1285 
1286 	status |= mxge_send_cmd(sc,
1287 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1288 
1289 
1290 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1291 
1292 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1293 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1294 
1295 
1296 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1297 				&cmd);
1298 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1299 	if (status != 0) {
1300 		device_printf(sc->dev, "failed set interrupt parameters\n");
1301 		return status;
1302 	}
1303 
1304 
1305 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1306 
1307 
1308 	/* run a DMA benchmark */
1309 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1310 
1311 	for (slice = 0; slice < sc->num_slices; slice++) {
1312 		ss = &sc->ss[slice];
1313 
1314 		ss->irq_claim = irq_claim + (2 * slice);
1315 		/* reset mcp/driver shared state back to 0 */
1316 		ss->rx_done.idx = 0;
1317 		ss->rx_done.cnt = 0;
1318 		ss->tx.req = 0;
1319 		ss->tx.done = 0;
1320 		ss->tx.pkt_done = 0;
1321 		ss->tx.queue_active = 0;
1322 		ss->tx.activate = 0;
1323 		ss->tx.deactivate = 0;
1324 		ss->tx.wake = 0;
1325 		ss->tx.defrag = 0;
1326 		ss->tx.stall = 0;
1327 		ss->rx_big.cnt = 0;
1328 		ss->rx_small.cnt = 0;
1329 		ss->lro_bad_csum = 0;
1330 		ss->lro_queued = 0;
1331 		ss->lro_flushed = 0;
1332 		if (ss->fw_stats != NULL) {
1333 			ss->fw_stats->valid = 0;
1334 			ss->fw_stats->send_done_count = 0;
1335 		}
1336 	}
1337 	sc->rdma_tags_available = 15;
1338 	status = mxge_update_mac_address(sc);
1339 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1340 	mxge_change_pause(sc, sc->pause);
1341 	mxge_set_multicast_list(sc);
1342 	return status;
1343 }
1344 
1345 static int
1346 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1347 {
1348         mxge_softc_t *sc;
1349         unsigned int intr_coal_delay;
1350         int err;
1351 
1352         sc = arg1;
1353         intr_coal_delay = sc->intr_coal_delay;
1354         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1355         if (err != 0) {
1356                 return err;
1357         }
1358         if (intr_coal_delay == sc->intr_coal_delay)
1359                 return 0;
1360 
1361         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1362                 return EINVAL;
1363 
1364 	lwkt_serialize_enter(sc->ifp->if_serializer);
1365 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1366 	sc->intr_coal_delay = intr_coal_delay;
1367 
1368 	lwkt_serialize_exit(sc->ifp->if_serializer);
1369         return err;
1370 }
1371 
1372 static int
1373 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1374 {
1375         mxge_softc_t *sc;
1376         unsigned int enabled;
1377         int err;
1378 
1379         sc = arg1;
1380         enabled = sc->pause;
1381         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1382         if (err != 0) {
1383                 return err;
1384         }
1385         if (enabled == sc->pause)
1386                 return 0;
1387 
1388 	lwkt_serialize_enter(sc->ifp->if_serializer);
1389 	err = mxge_change_pause(sc, enabled);
1390 	lwkt_serialize_exit(sc->ifp->if_serializer);
1391         return err;
1392 }
1393 
1394 static int
1395 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1396 {
1397 	struct ifnet *ifp;
1398 	int err = 0;
1399 
1400 	ifp = sc->ifp;
1401 	if (lro_cnt == 0)
1402 		ifp->if_capenable &= ~IFCAP_LRO;
1403 	else
1404 		ifp->if_capenable |= IFCAP_LRO;
1405 	sc->lro_cnt = lro_cnt;
1406 	if (ifp->if_flags & IFF_RUNNING) {
1407 		mxge_close(sc);
1408 		err = mxge_open(sc);
1409 	}
1410 	return err;
1411 }
1412 
1413 static int
1414 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1415 {
1416 	mxge_softc_t *sc;
1417 	unsigned int lro_cnt;
1418 	int err;
1419 
1420 	sc = arg1;
1421 	lro_cnt = sc->lro_cnt;
1422 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1423 	if (err != 0)
1424 		return err;
1425 
1426 	if (lro_cnt == sc->lro_cnt)
1427 		return 0;
1428 
1429 	if (lro_cnt > 128)
1430 		return EINVAL;
1431 
1432 	lwkt_serialize_enter(sc->ifp->if_serializer);
1433 	err = mxge_change_lro_locked(sc, lro_cnt);
1434 	lwkt_serialize_exit(sc->ifp->if_serializer);
1435 	return err;
1436 }
1437 
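/*
 * Sysctl read handler for the big-endian counters in the firmware
 * stats block: byteswap the value into arg2 and let sysctl_handle_int()
 * report it, so the "link_up" and "dropped_*" nodes registered below
 * can point straight at the DMA'd mcp_irq_data_t fields.
 */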
1438 static int
1439 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1440 {
1441         int err;
1442 
1443         if (arg1 == NULL)
1444                 return EFAULT;
1445         arg2 = be32toh(*(int *)arg1);
1446         arg1 = NULL;
1447         err = sysctl_handle_int(oidp, arg1, arg2, req);
1448 
1449         return err;
1450 }
1451 
1452 static void
1453 mxge_rem_sysctls(mxge_softc_t *sc)
1454 {
1455 	struct mxge_slice_state *ss;
1456 	int slice;
1457 
1458 	if (sc->slice_sysctl_tree == NULL)
1459 		return;
1460 
1461 	for (slice = 0; slice < sc->num_slices; slice++) {
1462 		ss = &sc->ss[slice];
1463 		if (ss == NULL || ss->sysctl_tree == NULL)
1464 			continue;
1465 		sysctl_ctx_free(&ss->sysctl_ctx);
1466 		ss->sysctl_tree = NULL;
1467 	}
1468 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1469 	sc->slice_sysctl_tree = NULL;
1470 	sysctl_ctx_free(&sc->sysctl_ctx);
1471 	sc->sysctl_tree = NULL;
1472 
1473 }
1474 
1475 static void
1476 mxge_add_sysctls(mxge_softc_t *sc)
1477 {
1478 	struct sysctl_ctx_list *ctx;
1479 	struct sysctl_oid_list *children;
1480 	mcp_irq_data_t *fw;
1481 	struct mxge_slice_state *ss;
1482 	int slice;
1483 	char slice_num[8];
1484 
1485 	ctx = &sc->sysctl_ctx;
1486 	sysctl_ctx_init(ctx);
1487 	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
1488 					  OID_AUTO,
1489 					  device_get_nameunit(sc->dev),
1490 					  CTLFLAG_RD, 0, "");
1491 	if (sc->sysctl_tree == NULL) {
1492 		device_printf(sc->dev, "can't add sysctl node\n");
1493 		return;
1494 	}
1495 
1496 	children = SYSCTL_CHILDREN(sc->sysctl_tree);
1497 	fw = sc->ss[0].fw_stats;
1498 
1499 	/* random information */
1500 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1501 		       "firmware_version",
1502 		       CTLFLAG_RD, &sc->fw_version,
1503 		       0, "firmware version");
1504 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1505 		       "serial_number",
1506 		       CTLFLAG_RD, &sc->serial_number_string,
1507 		       0, "serial number");
1508 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1509 		       "product_code",
1510 		       CTLFLAG_RD, &sc->product_code_string,
1511 		       0, "product_code");
1512 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1513 		       "pcie_link_width",
1514 		       CTLFLAG_RD, &sc->link_width,
1515 		       0, "tx_boundary");
1516 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1517 		       "tx_boundary",
1518 		       CTLFLAG_RD, &sc->tx_boundary,
1519 		       0, "tx_boundary");
1520 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1521 		       "write_combine",
1522 		       CTLFLAG_RD, &sc->wc,
1523 		       0, "write combining PIO?");
1524 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1525 		       "read_dma_MBs",
1526 		       CTLFLAG_RD, &sc->read_dma,
1527 		       0, "DMA Read speed in MB/s");
1528 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1529 		       "write_dma_MBs",
1530 		       CTLFLAG_RD, &sc->write_dma,
1531 		       0, "DMA Write speed in MB/s");
1532 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1533 		       "read_write_dma_MBs",
1534 		       CTLFLAG_RD, &sc->read_write_dma,
1535 		       0, "DMA concurrent Read/Write speed in MB/s");
1536 
1537 
1538 	/* performance related tunables */
1539 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1540 			"intr_coal_delay",
1541 			CTLTYPE_INT|CTLFLAG_RW, sc,
1542 			0, mxge_change_intr_coal,
1543 			"I", "interrupt coalescing delay in usecs");
1544 
1545 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 			"flow_control_enabled",
1547 			CTLTYPE_INT|CTLFLAG_RW, sc,
1548 			0, mxge_change_flow_control,
1549 			"I", "interrupt coalescing delay in usecs");
1550 
1551 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1552 		       "deassert_wait",
1553 		       CTLFLAG_RW, &mxge_deassert_wait,
1554 		       0, "Wait for IRQ line to go low in ihandler");
1555 
1556 	/* stats block from firmware is in network byte order.
1557 	   Need to swap it */
1558 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559 			"link_up",
1560 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1561 			0, mxge_handle_be32,
1562 			"I", "link up");
1563 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1564 			"rdma_tags_available",
1565 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1566 			0, mxge_handle_be32,
1567 			"I", "rdma_tags_available");
1568 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1569 			"dropped_bad_crc32",
1570 			CTLTYPE_INT|CTLFLAG_RD,
1571 			&fw->dropped_bad_crc32,
1572 			0, mxge_handle_be32,
1573 			"I", "dropped_bad_crc32");
1574 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1575 			"dropped_bad_phy",
1576 			CTLTYPE_INT|CTLFLAG_RD,
1577 			&fw->dropped_bad_phy,
1578 			0, mxge_handle_be32,
1579 			"I", "dropped_bad_phy");
1580 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 			"dropped_link_error_or_filtered",
1582 			CTLTYPE_INT|CTLFLAG_RD,
1583 			&fw->dropped_link_error_or_filtered,
1584 			0, mxge_handle_be32,
1585 			"I", "dropped_link_error_or_filtered");
1586 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 			"dropped_link_overflow",
1588 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1589 			0, mxge_handle_be32,
1590 			"I", "dropped_link_overflow");
1591 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 			"dropped_multicast_filtered",
1593 			CTLTYPE_INT|CTLFLAG_RD,
1594 			&fw->dropped_multicast_filtered,
1595 			0, mxge_handle_be32,
1596 			"I", "dropped_multicast_filtered");
1597 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 			"dropped_no_big_buffer",
1599 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1600 			0, mxge_handle_be32,
1601 			"I", "dropped_no_big_buffer");
1602 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1603 			"dropped_no_small_buffer",
1604 			CTLTYPE_INT|CTLFLAG_RD,
1605 			&fw->dropped_no_small_buffer,
1606 			0, mxge_handle_be32,
1607 			"I", "dropped_no_small_buffer");
1608 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 			"dropped_overrun",
1610 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1611 			0, mxge_handle_be32,
1612 			"I", "dropped_overrun");
1613 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1614 			"dropped_pause",
1615 			CTLTYPE_INT|CTLFLAG_RD,
1616 			&fw->dropped_pause,
1617 			0, mxge_handle_be32,
1618 			"I", "dropped_pause");
1619 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1620 			"dropped_runt",
1621 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1622 			0, mxge_handle_be32,
1623 			"I", "dropped_runt");
1624 
1625 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1626 			"dropped_unicast_filtered",
1627 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1628 			0, mxge_handle_be32,
1629 			"I", "dropped_unicast_filtered");
1630 
1631 	/* verbose printing? */
1632 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 		       "verbose",
1634 		       CTLFLAG_RW, &mxge_verbose,
1635 		       0, "verbose printing");
1636 
1637 	/* lro */
1638 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1639 			"lro_cnt",
1640 			CTLTYPE_INT|CTLFLAG_RW, sc,
1641 			0, mxge_change_lro,
1642 			"I", "number of lro merge queues");
1643 
1644 
1645 	/* add counters exported for debugging from all slices */
1646 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1647 	sc->slice_sysctl_tree =
1648 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1649 				"slice", CTLFLAG_RD, 0, "");
1650 
1651 	for (slice = 0; slice < sc->num_slices; slice++) {
1652 		ss = &sc->ss[slice];
1653 		sysctl_ctx_init(&ss->sysctl_ctx);
1654 		ctx = &ss->sysctl_ctx;
1655 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1656 		ksprintf(slice_num, "%d", slice);
1657 		ss->sysctl_tree =
1658 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1659 					CTLFLAG_RD, 0, "");
1660 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1661 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662 			       "rx_small_cnt",
1663 			       CTLFLAG_RD, &ss->rx_small.cnt,
1664 			       0, "rx_small_cnt");
1665 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666 			       "rx_big_cnt",
1667 			       CTLFLAG_RD, &ss->rx_big.cnt,
1668 			       0, "rx_small_cnt");
1669 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1671 			       0, "number of lro merge queues flushed");
1672 
1673 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1675 			       0, "number of frames appended to lro merge"
1676 			       "queues");
1677 
1678 #ifndef IFNET_BUF_RING
1679 		/* only transmit from slice 0 for now */
1680 		if (slice > 0)
1681 			continue;
1682 #endif
1683 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684 			       "tx_req",
1685 			       CTLFLAG_RD, &ss->tx.req,
1686 			       0, "tx_req");
1687 
1688 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689 			       "tx_done",
1690 			       CTLFLAG_RD, &ss->tx.done,
1691 			       0, "tx_done");
1692 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693 			       "tx_pkt_done",
1694 			       CTLFLAG_RD, &ss->tx.pkt_done,
1695 			       0, "tx_done");
1696 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1697 			       "tx_stall",
1698 			       CTLFLAG_RD, &ss->tx.stall,
1699 			       0, "tx_stall");
1700 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1701 			       "tx_wake",
1702 			       CTLFLAG_RD, &ss->tx.wake,
1703 			       0, "tx_wake");
1704 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1705 			       "tx_defrag",
1706 			       CTLFLAG_RD, &ss->tx.defrag,
1707 			       0, "tx_defrag");
1708 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1709 			       "tx_queue_active",
1710 			       CTLFLAG_RD, &ss->tx.queue_active,
1711 			       0, "tx_queue_active");
1712 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1713 			       "tx_activate",
1714 			       CTLFLAG_RD, &ss->tx.activate,
1715 			       0, "tx_activate");
1716 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1717 			       "tx_deactivate",
1718 			       CTLFLAG_RD, &ss->tx.deactivate,
1719 			       0, "tx_deactivate");
1720 	}
1721 }
1722 
1723 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1724    backwards one at a time and handle ring wraps */
1725 
1726 static inline void
1727 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1728 			    mcp_kreq_ether_send_t *src, int cnt)
1729 {
1730         int idx, starting_slot;
1731         starting_slot = tx->req;
1732         while (cnt > 1) {
1733                 cnt--;
1734                 idx = (starting_slot + cnt) & tx->mask;
1735                 mxge_pio_copy(&tx->lanai[idx],
1736 			      &src[cnt], sizeof(*src));
1737                 wmb();
1738         }
1739 }
1740 
1741 /*
1742  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1743  * at most 32 bytes at a time, so as to avoid involving the software
1744  * pio handler in the nic.   We re-write the first segment's flags
1745  * to mark them valid only after writing the entire chain
1746  */
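/*
 * Size note (assuming sizeof(mcp_kreq_ether_send_t) == 16, per the mcp
 * headers): copying two requests per mxge_pio_copy() below is exactly
 * the 32-byte limit mentioned above, and the wmb() between copies keeps
 * the NIC from observing a half-written pair.
 */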
1747 
1748 static inline void
1749 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1750                   int cnt)
1751 {
1752         int idx, i;
1753         uint32_t *src_ints;
1754 	volatile uint32_t *dst_ints;
1755         mcp_kreq_ether_send_t *srcp;
1756 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1757 	uint8_t last_flags;
1758 
1759         idx = tx->req & tx->mask;
1760 
1761 	last_flags = src->flags;
1762 	src->flags = 0;
1763         wmb();
1764         dst = dstp = &tx->lanai[idx];
1765         srcp = src;
1766 
1767         if ((idx + cnt) < tx->mask) {
1768                 for (i = 0; i < (cnt - 1); i += 2) {
1769                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1770                         wmb(); /* force write every 32 bytes */
1771                         srcp += 2;
1772                         dstp += 2;
1773                 }
1774         } else {
1775                 /* submit all but the first request, and ensure
1776                    that it is submitted below */
1777                 mxge_submit_req_backwards(tx, src, cnt);
1778                 i = 0;
1779         }
1780         if (i < cnt) {
1781                 /* submit the first request */
1782                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1783                 wmb(); /* barrier before setting valid flag */
1784         }
1785 
1786         /* re-write the last 32-bits with the valid flags */
1787         src->flags = last_flags;
1788         src_ints = (uint32_t *)src;
1789         src_ints += 3;
1790         dst_ints = (volatile uint32_t *)dst;
1791         dst_ints += 3;
1792         *dst_ints = *src_ints;
1793         tx->req += cnt;
1794         wmb();
1795 }
1796 
1797 #if IFCAP_TSO4
1798 
1799 static void
1800 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1801 	       int busdma_seg_cnt, int ip_off)
1802 {
1803 	mxge_tx_ring_t *tx;
1804 	mcp_kreq_ether_send_t *req;
1805 	bus_dma_segment_t *seg;
1806 	struct ip *ip;
1807 	struct tcphdr *tcp;
1808 	uint32_t low, high_swapped;
1809 	int len, seglen, cum_len, cum_len_next;
1810 	int next_is_first, chop, cnt, rdma_count, small;
1811 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1812 	uint8_t flags, flags_next;
1813 	static int once;
1814 
1815 	mss = m->m_pkthdr.tso_segsz;
1816 
1817 	/* negative cum_len signifies to the
1818 	 * send loop that we are still in the
1819 	 * header portion of the TSO packet.
1820 	 */
1821 
1822 	/* ensure we have the ethernet, IP and TCP
1823 	   header together in the first mbuf, copy
1824 	   it to a scratch buffer if not */
1825 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1826 		m_copydata(m, 0, ip_off + sizeof (*ip),
1827 			   ss->scratch);
1828 		ip = (struct ip *)(ss->scratch + ip_off);
1829 	} else {
1830 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1831 	}
1832 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1833 			    + sizeof (*tcp))) {
1834 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1835 			   + sizeof (*tcp), ss->scratch);
1836 		ip = (struct ip *)(ss->scratch + ip_off);
1837 	}
1838 
1839 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1840 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1841 
1842 	/* TSO implies checksum offload on this hardware */
1843 	cksum_offset = ip_off + (ip->ip_hl << 2);
1844 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1845 
1846 
1847 	/* for TSO, pseudo_hdr_offset holds mss.
1848 	 * The firmware figures out where to put
1849 	 * the checksum by parsing the header. */
1850 	pseudo_hdr_offset = htobe16(mss);
1851 
1852 	tx = &ss->tx;
1853 	req = tx->req_list;
1854 	seg = tx->seg_list;
1855 	cnt = 0;
1856 	rdma_count = 0;
1857 	/* "rdma_count" is the number of RDMAs belonging to the
1858 	 * current packet BEFORE the current send request. For
1859 	 * non-TSO packets, this is equal to "count".
1860 	 * For TSO packets, rdma_count needs to be reset
1861 	 * to 0 after a segment cut.
1862 	 *
1863 	 * The rdma_count field of the send request is
1864 	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1866 	 * in the middle, this is the number of RDMAs starting
1867 	 * after the last cut in the request. All previous
1868 	 * segments before the last cut implicitly have 1 RDMA.
1869 	 *
1870 	 * Since the number of RDMAs is not known beforehand,
1871 	 * it must be filled-in retroactively - after each
1872 	 * segmentation cut or at the end of the entire packet.
1873 	 */
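	/*
	 * Illustrative example: for a chain [hdr][pld][pld|CHOP][pld],
	 * the CHOP descriptor closes a wire segment, so rdma_count
	 * restarts there and the count accumulated up to the cut is
	 * written back into the descriptor that opened the segment,
	 * via (req - rdma_count)->rdma_count below.
	 */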
1874 
1875 	while (busdma_seg_cnt) {
1876 		/* Break the busdma segment up into pieces*/
1877 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1878 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1879 		len = seg->ds_len;
1880 
1881 		while (len) {
1882 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1883 			seglen = len;
1884 			cum_len_next = cum_len + seglen;
1885 			(req-rdma_count)->rdma_count = rdma_count + 1;
1886 			if (__predict_true(cum_len >= 0)) {
1887 				/* payload */
1888 				chop = (cum_len_next > mss);
1889 				cum_len_next = cum_len_next % mss;
1890 				next_is_first = (cum_len_next == 0);
1891 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1892 				flags_next |= next_is_first *
1893 					MXGEFW_FLAGS_FIRST;
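				/*
				 * Branchless bookkeeping: a cut or a
				 * segment boundary resets rdma_count to
				 * -1 (all ones); the CHOP descriptor is
				 * counted only when the next descriptor
				 * does not begin a new segment.
				 */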
1894 				rdma_count |= -(chop | next_is_first);
1895 				rdma_count += chop & !next_is_first;
1896 			} else if (cum_len_next >= 0) {
1897 				/* header ends */
1898 				rdma_count = -1;
1899 				cum_len_next = 0;
1900 				seglen = -cum_len;
1901 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1902 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1903 					MXGEFW_FLAGS_FIRST |
1904 					(small * MXGEFW_FLAGS_SMALL);
			}
1906 
1907 			req->addr_high = high_swapped;
1908 			req->addr_low = htobe32(low);
1909 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1910 			req->pad = 0;
1911 			req->rdma_count = 1;
1912 			req->length = htobe16(seglen);
1913 			req->cksum_offset = cksum_offset;
1914 			req->flags = flags | ((cum_len & 1) *
1915 					      MXGEFW_FLAGS_ALIGN_ODD);
1916 			low += seglen;
1917 			len -= seglen;
1918 			cum_len = cum_len_next;
1919 			flags = flags_next;
1920 			req++;
1921 			cnt++;
1922 			rdma_count++;
1923 			if (__predict_false(cksum_offset > seglen))
1924 				cksum_offset -= seglen;
1925 			else
1926 				cksum_offset = 0;
1927 			if (__predict_false(cnt > tx->max_desc))
1928 				goto drop;
1929 		}
1930 		busdma_seg_cnt--;
1931 		seg++;
1932 	}
1933 	(req-rdma_count)->rdma_count = rdma_count;
1934 
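	/*
	 * Walk backwards, flagging every descriptor of the final wire
	 * segment as TSO_LAST; stop once the segment's opening
	 * descriptor (CHOP or FIRST) has been marked.
	 */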
1935 	do {
1936 		req--;
1937 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1938 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1939 
1940 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1941 	mxge_submit_req(tx, tx->req_list, cnt);
1942 #ifdef IFNET_BUF_RING
1943 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1944 		/* tell the NIC to start polling this slice */
1945 		*tx->send_go = 1;
1946 		tx->queue_active = 1;
1947 		tx->activate++;
1948 		wmb();
1949 	}
1950 #endif
1951 	return;
1952 
1953 drop:
1954 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1955 	m_freem(m);
1956 	ss->oerrors++;
1957 	if (!once) {
1958 		kprintf("tx->max_desc exceeded via TSO!\n");
		kprintf("mss = %d, seg offset = %ld, max_desc = %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
1961 		once = 1;
1962 	}
1963 	return;
1964 
1965 }
1966 
1967 #endif /* IFCAP_TSO4 */
1968 
1969 #ifdef MXGE_NEW_VLAN_API
1970 /*
1971  * We reproduce the software vlan tag insertion from
1972  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1973  * vlan tag insertion. We need to advertise this in order to have the
1974  * vlan interface respect our csum offload flags.
1975  */
1976 static struct mbuf *
1977 mxge_vlan_tag_insert(struct mbuf *m)
1978 {
1979 	struct ether_vlan_header *evl;
1980 
1981 	M_PREPEND(m, EVL_ENCAPLEN, MB_DONTWAIT);
1982 	if (__predict_false(m == NULL))
1983 		return NULL;
1984 	if (m->m_len < sizeof(*evl)) {
1985 		m = m_pullup(m, sizeof(*evl));
1986 		if (__predict_false(m == NULL))
1987 			return NULL;
1988 	}
1989 	/*
1990 	 * Transform the Ethernet header into an Ethernet header
1991 	 * with 802.1Q encapsulation.
1992 	 */
1993 	evl = mtod(m, struct ether_vlan_header *);
1994 	bcopy((char *)evl + EVL_ENCAPLEN,
1995 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1996 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1997 	evl->evl_tag = htons(m->m_pkthdr.ether_vlantag);
1998 	m->m_flags &= ~M_VLANTAG;
1999 	return m;
2000 }
2001 #endif /* MXGE_NEW_VLAN_API */
2002 
2003 static void
2004 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2005 {
2006 	mxge_softc_t *sc;
2007 	mcp_kreq_ether_send_t *req;
2008 	bus_dma_segment_t *seg;
2009 	struct mbuf *m_tmp;
2010 	mxge_tx_ring_t *tx;
2011 	struct ip *ip;
2012 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2013 	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;

2017 	sc = ss->sc;
2018 	tx = &ss->tx;
2019 
2020 	ip_off = sizeof (struct ether_header);
2021 #ifdef MXGE_NEW_VLAN_API
2022 	if (m->m_flags & M_VLANTAG) {
2023 		m = mxge_vlan_tag_insert(m);
2024 		if (__predict_false(m == NULL))
2025 			goto drop;
2026 		ip_off += EVL_ENCAPLEN;
2027 	}
2028 #endif
2029 	/* (try to) map the frame for DMA */
2030 	idx = tx->req & tx->mask;
2031 	err = bus_dmamap_load_mbuf_segment(tx->dmat, tx->info[idx].map,
2032 					   m, tx->seg_list, 1, &cnt,
2033 					   BUS_DMA_NOWAIT);
2034 	if (__predict_false(err == EFBIG)) {
2035 		/* Too many segments in the chain.  Try
2036 		   to defrag */
2037 		m_tmp = m_defrag(m, MB_DONTWAIT);
2038 		if (m_tmp == NULL) {
2039 			goto drop;
2040 		}
2041 		ss->tx.defrag++;
2042 		m = m_tmp;
2043 		err = bus_dmamap_load_mbuf_segment(tx->dmat,
2044 					      tx->info[idx].map,
2045 					      m, tx->seg_list, 1, &cnt,
2046 					      BUS_DMA_NOWAIT);
2047 	}
2048 	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_segment returned %d, "
			      "packet len = %d\n", err, m->m_pkthdr.len);
2051 		goto drop;
2052 	}
2053 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2054 			BUS_DMASYNC_PREWRITE);
2055 	tx->info[idx].m = m;
2056 
2057 #if IFCAP_TSO4
2058 	/* TSO is different enough, we handle it in another routine */
2059 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2060 		mxge_encap_tso(ss, m, cnt, ip_off);
2061 		return;
2062 	}
2063 #endif
2064 
2065 	req = tx->req_list;
2066 	cksum_offset = 0;
2067 	pseudo_hdr_offset = 0;
2068 	flags = MXGEFW_FLAGS_NO_TSO;
2069 
2070 	/* checksum offloading? */
2071 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2072 		/* ensure ip header is in first mbuf, copy
2073 		   it to a scratch buffer if not */
2074 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2075 			m_copydata(m, 0, ip_off + sizeof (*ip),
2076 				   ss->scratch);
2077 			ip = (struct ip *)(ss->scratch + ip_off);
2078 		} else {
2079 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2080 		}
2081 		cksum_offset = ip_off + (ip->ip_hl << 2);
2082 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2083 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2084 		req->cksum_offset = cksum_offset;
2085 		flags |= MXGEFW_FLAGS_CKSUM;
2086 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2087 	} else {
2088 		odd_flag = 0;
2089 	}
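	/*
	 * MXGEFW_FLAGS_ALIGN_ODD (applied below via (cum_len & 1)) tells
	 * the firmware that a descriptor starts at an odd byte offset in
	 * the frame, so it can keep the one's-complement partial checksum
	 * aligned; it is only meaningful when checksum offload is on.
	 */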
2090 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2091 		flags |= MXGEFW_FLAGS_SMALL;
2092 
2093 	/* convert segments into a request list */
2094 	cum_len = 0;
2095 	seg = tx->seg_list;
2096 	req->flags = MXGEFW_FLAGS_FIRST;
2097 	for (i = 0; i < cnt; i++) {
2098 		req->addr_low =
2099 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2100 		req->addr_high =
2101 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2102 		req->length = htobe16(seg->ds_len);
2103 		req->cksum_offset = cksum_offset;
2104 		if (cksum_offset > seg->ds_len)
2105 			cksum_offset -= seg->ds_len;
2106 		else
2107 			cksum_offset = 0;
2108 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2109 		req->pad = 0; /* complete solid 16-byte block */
2110 		req->rdma_count = 1;
2111 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2112 		cum_len += seg->ds_len;
2113 		seg++;
2114 		req++;
2115 		req->flags = 0;
2116 	}
2117 	req--;
	/* pad runts to 60 bytes; the minimum Ethernet frame is 64 bytes
	   including the 4-byte FCS, which the NIC appends */
2119 	if (cum_len < 60) {
2120 		req++;
2121 		req->addr_low =
2122 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2123 		req->addr_high =
2124 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2125 		req->length = htobe16(60 - cum_len);
2126 		req->cksum_offset = 0;
2127 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2128 		req->pad = 0; /* complete solid 16-byte block */
2129 		req->rdma_count = 1;
2130 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2131 		cnt++;
2132 	}
2133 
2134 	tx->req_list[0].rdma_count = cnt;
2135 #if 0
2136 	/* print what the firmware will see */
2137 	for (i = 0; i < cnt; i++) {
		kprintf("%d: addr: 0x%x 0x%x len:%d pso:%d, "
		    "cso:%d, flags:0x%x, rdma:%d\n",
2140 		    i, (int)ntohl(tx->req_list[i].addr_high),
2141 		    (int)ntohl(tx->req_list[i].addr_low),
2142 		    (int)ntohs(tx->req_list[i].length),
2143 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2144 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2145 		    tx->req_list[i].rdma_count);
2146 	}
2147 	kprintf("--------------\n");
2148 #endif
2149 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2150 	mxge_submit_req(tx, tx->req_list, cnt);
2151 #ifdef IFNET_BUF_RING
2152 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2153 		/* tell the NIC to start polling this slice */
2154 		*tx->send_go = 1;
2155 		tx->queue_active = 1;
2156 		tx->activate++;
2157 		wmb();
2158 	}
2159 #endif
2160 	return;
2161 
2162 drop:
2163 	m_freem(m);
2164 	ss->oerrors++;
2165 	return;
2166 }
2167 
2168 static inline void
2169 mxge_start_locked(struct mxge_slice_state *ss)
2170 {
2171 	mxge_softc_t *sc;
2172 	struct mbuf *m;
2173 	struct ifnet *ifp;
2174 	mxge_tx_ring_t *tx;
2175 
2176 	sc = ss->sc;
2177 	ifp = sc->ifp;
2178 	tx = &ss->tx;
2179 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2180 		m = ifq_dequeue(&ifp->if_snd, NULL);
2181 		if (m == NULL) {
2182 			return;
2183 		}
2184 		/* let BPF see it */
2185 		BPF_MTAP(ifp, m);
2186 
2187 		/* give it to the nic */
2188 		mxge_encap(ss, m);
2189 	}
2190 	/* ran out of transmit slots */
2191 	if (!ifq_is_oactive(&ifp->if_snd)) {
2192 		ifq_set_oactive(&ifp->if_snd);
2193 		tx->stall++;
2194 	}
2195 }
2196 
2197 static void
2198 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
2199 {
2200 	mxge_softc_t *sc = ifp->if_softc;
2201 	struct mxge_slice_state *ss;
2202 
2203 	ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
2204 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
2205 	/* only use the first slice for now */
2206 	ss = &sc->ss[0];
2207 	mxge_start_locked(ss);
2208 }
2209 
2210 /*
 * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the NIC.  We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst.
 */
2217 static inline void
2218 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2219 		mcp_kreq_ether_recv_t *src)
2220 {
2221 	uint32_t low;
2222 
2223 	low = src->addr_low;
2224 	src->addr_low = 0xffffffff;
2225 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2226 	wmb();
2227 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2228 	wmb();
2229 	src->addr_low = low;
2230 	dst->addr_low = low;
2231 	wmb();
2232 }
2233 
2234 static int
2235 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2236 {
2237 	bus_dma_segment_t seg;
2238 	struct mbuf *m;
2239 	mxge_rx_ring_t *rx = &ss->rx_small;
2240 	int cnt, err;
2241 
2242 	m = m_gethdr(MB_DONTWAIT, MT_DATA);
2243 	if (m == NULL) {
2244 		rx->alloc_fail++;
2245 		err = ENOBUFS;
2246 		goto done;
2247 	}
2248 	m->m_len = m->m_pkthdr.len = MHLEN;
2249 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2250 				      &seg, 1, &cnt, BUS_DMA_NOWAIT);
2251 	if (err != 0) {
2252 		kprintf("can't dmamap small (%d)\n", err);
2253 		m_free(m);
2254 		goto done;
2255 	}
2256 	rx->info[idx].m = m;
2257 	rx->shadow[idx].addr_low =
2258 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2259 	rx->shadow[idx].addr_high =
2260 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2261 
2262 done:
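	/* receive descriptors are pushed to the NIC in batches of 8;
	   at 8 bytes each, that is two 32-byte PIO bursts */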
2263 	if ((idx & 7) == 7)
2264 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2265 	return err;
2266 }
2267 
2268 
2269 static int
2270 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2271 {
2272 	bus_dma_segment_t seg[3];
2273 	struct mbuf *m;
2274 	mxge_rx_ring_t *rx = &ss->rx_big;
2275 	int cnt, err, i;
2276 
2277 	if (rx->cl_size == MCLBYTES)
2278 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2279 	else {
2280 #if 0
2281 		m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2282 #else
2283 		/*
2284 		 * XXX: allocate normal sized buffers for big buffers.
2285 		 * We should be fine as long as we don't get any jumbo frames
2286 		 */
2287 		m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
2288 #endif
2289 	}
2290 	if (m == NULL) {
2291 		rx->alloc_fail++;
2292 		err = ENOBUFS;
2293 		goto done;
2294 	}
	m->m_len = m->m_pkthdr.len = rx->mlen;
2297 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2298 				      seg, 1, &cnt, BUS_DMA_NOWAIT);
2299 	if (err != 0) {
2300 		kprintf("can't dmamap big (%d)\n", err);
2301 		m_free(m);
2302 		goto done;
2303 	}
2304 	rx->info[idx].m = m;
2305 	rx->shadow[idx].addr_low =
2306 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2307 	rx->shadow[idx].addr_high =
2308 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2309 
2310 #if MXGE_VIRT_JUMBOS
2311 	for (i = 1; i < cnt; i++) {
2312 		rx->shadow[idx + i].addr_low =
2313 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2314 		rx->shadow[idx + i].addr_high =
2315 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
	}
2317 #endif
2318 
2319 done:
	for (i = 0; i < rx->nbufs; i++) {
2321 		if ((idx & 7) == 7) {
2322 			mxge_submit_8rx(&rx->lanai[idx - 7],
2323 					&rx->shadow[idx - 7]);
2324 		}
2325 		idx++;
2326 	}
2327 	return err;
2328 }
2329 
2330 /*
2331  *  Myri10GE hardware checksums are not valid if the sender
2332  *  padded the frame with non-zero padding.  This is because
2333  *  the firmware just does a simple 16-bit 1s complement
2334  *  checksum across the entire frame, excluding the first 14
 *  bytes.  It is best to simply check the checksum and
 *  tell the stack about it only if the checksum is good.
 */
2338 
2339 static inline uint16_t
2340 mxge_rx_csum(struct mbuf *m, int csum)
2341 {
2342 	struct ether_header *eh;
2343 	struct ip *ip;
2344 	uint16_t c;
2345 
2346 	eh = mtod(m, struct ether_header *);
2347 
2348 	/* only deal with IPv4 TCP & UDP for now */
2349 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2350 		return 1;
2351 	ip = (struct ip *)(eh + 1);
2352 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2353 			    ip->ip_p != IPPROTO_UDP))
2354 		return 1;
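	/*
	 * The firmware checksum is a plain one's-complement sum of the
	 * frame past the Ethernet header.  Folding in the IPv4
	 * pseudo-header (addresses, payload length, protocol) should
	 * yield 0xffff for a good TCP/UDP checksum, so c ends up 0 when
	 * the packet checks out.
	 */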
2355 #ifdef INET
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
			    (ip->ip_hl << 2) + ip->ip_p));
2359 #else
2360 	c = 1;
2361 #endif
2362 	c ^= 0xffff;
2363 	return (c);
2364 }
2365 
2366 static void
2367 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2368 {
2369 	struct ether_vlan_header *evl;
2370 	uint32_t partial;
2371 
2372 	evl = mtod(m, struct ether_vlan_header *);
2373 
2374 	/*
2375 	 * fix checksum by subtracting EVL_ENCAPLEN bytes
2376 	 * after what the firmware thought was the end of the ethernet
2377 	 * header.
2378 	 */
2379 
2380 	/* put checksum into host byte order */
2381 	*csum = ntohs(*csum);
2382 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	(*csum) += ~partial;		 /* 1s-complement subtract */
	(*csum) += ((*csum) < ~partial); /* end-around carry */
	/* fold back down to 16 bits */
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2387 
2388 	/* restore checksum to network byte order;
2389 	   later consumers expect this */
2390 	*csum = htons(*csum);
2391 
2392 	/* save the tag */
2393 #ifdef MXGE_NEW_VLAN_API
2394 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2395 #else
2396 	{
2397 		struct m_tag *mtag;
2398 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2399 				   MB_DONTWAIT);
2400 		if (mtag == NULL)
2401 			return;
2402 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2403 		m_tag_prepend(m, mtag);
2404 	}
2405 
2406 #endif
2407 	m->m_flags |= M_VLANTAG;
2408 
2409 	/*
2410 	 * Remove the 802.1q header by copying the Ethernet
2411 	 * addresses over it and adjusting the beginning of
2412 	 * the data in the mbuf.  The encapsulated Ethernet
2413 	 * type field is already in place.
2414 	 */
2415 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2416 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2417 	m_adj(m, EVL_ENCAPLEN);
2418 }
2419 
2420 
2421 static inline void
2422 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2423 {
2424 	mxge_softc_t *sc;
2425 	struct ifnet *ifp;
2426 	struct mbuf *m;
2427 	struct ether_header *eh;
2428 	mxge_rx_ring_t *rx;
2429 	bus_dmamap_t old_map;
2430 	int idx;
2431 	uint16_t tcpudp_csum;
2432 
2433 	sc = ss->sc;
2434 	ifp = sc->ifp;
2435 	rx = &ss->rx_big;
2436 	idx = rx->cnt & rx->mask;
2437 	rx->cnt += rx->nbufs;
2438 	/* save a pointer to the received mbuf */
2439 	m = rx->info[idx].m;
2440 	/* try to replace the received mbuf */
2441 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is recycled */
2443 		IFNET_STAT_INC(ifp, ierrors, 1);
2444 		return;
2445 	}
2446 
2447 	/* unmap the received buffer */
2448 	old_map = rx->info[idx].map;
2449 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2450 	bus_dmamap_unload(rx->dmat, old_map);
2451 
2452 	/* swap the bus_dmamap_t's */
2453 	rx->info[idx].map = rx->extra_map;
2454 	rx->extra_map = old_map;
2455 
2456 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2457 	 * aligned */
2458 	m->m_data += MXGEFW_PAD;
2459 
2460 	m->m_pkthdr.rcvif = ifp;
2461 	m->m_len = m->m_pkthdr.len = len;
2462 	ss->ipackets++;
2463 	eh = mtod(m, struct ether_header *);
2464 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2465 		mxge_vlan_tag_remove(m, &csum);
2466 	}
2467 	/* if the checksum is valid, mark it in the mbuf header */
2468 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2469 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2470 			return;
2471 		/* otherwise, it was a UDP frame, or a TCP frame which
2472 		   we could not do LRO on.  Tell the stack that the
2473 		   checksum is good */
2474 		m->m_pkthdr.csum_data = 0xffff;
2475 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2476 	}
2477 #if 0
2478 	/* flowid only valid if RSS hashing is enabled */
2479 	if (sc->num_slices > 1) {
2480 		m->m_pkthdr.flowid = (ss - sc->ss);
2481 		m->m_flags |= M_FLOWID;
2482 	}
2483 #endif
2484 	ifp->if_input(ifp, m);
2485 }
2486 
2487 static inline void
2488 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2489 {
2490 	mxge_softc_t *sc;
2491 	struct ifnet *ifp;
2492 	struct ether_header *eh;
2493 	struct mbuf *m;
2494 	mxge_rx_ring_t *rx;
2495 	bus_dmamap_t old_map;
2496 	int idx;
2497 	uint16_t tcpudp_csum;
2498 
2499 	sc = ss->sc;
2500 	ifp = sc->ifp;
2501 	rx = &ss->rx_small;
2502 	idx = rx->cnt & rx->mask;
2503 	rx->cnt++;
2504 	/* save a pointer to the received mbuf */
2505 	m = rx->info[idx].m;
2506 	/* try to replace the received mbuf */
2507 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is recycled */
2509 		IFNET_STAT_INC(ifp, ierrors, 1);
2510 		return;
2511 	}
2512 
2513 	/* unmap the received buffer */
2514 	old_map = rx->info[idx].map;
2515 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2516 	bus_dmamap_unload(rx->dmat, old_map);
2517 
2518 	/* swap the bus_dmamap_t's */
2519 	rx->info[idx].map = rx->extra_map;
2520 	rx->extra_map = old_map;
2521 
2522 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2523 	 * aligned */
2524 	m->m_data += MXGEFW_PAD;
2525 
2526 	m->m_pkthdr.rcvif = ifp;
2527 	m->m_len = m->m_pkthdr.len = len;
2528 	ss->ipackets++;
2529 	eh = mtod(m, struct ether_header *);
2530 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2531 		mxge_vlan_tag_remove(m, &csum);
2532 	}
2533 	/* if the checksum is valid, mark it in the mbuf header */
2534 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2535 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2536 			return;
2537 		/* otherwise, it was a UDP frame, or a TCP frame which
2538 		   we could not do LRO on.  Tell the stack that the
2539 		   checksum is good */
2540 		m->m_pkthdr.csum_data = 0xffff;
2541 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2542 	}
2543 #if 0
2544 	/* flowid only valid if RSS hashing is enabled */
2545 	if (sc->num_slices > 1) {
2546 		m->m_pkthdr.flowid = (ss - sc->ss);
2547 		m->m_flags |= M_FLOWID;
2548 	}
2549 #endif
2550 	ifp->if_input(ifp, m);
2551 }
2552 
2553 /*
2554  * XXX
2555  *
2556  * Inlining the call to this function causes mxge_intr() to grow too large
2557  * for GCC's stack size limits (which shouldn't take into account inlining
2558  * of leaf functions at one call site anyway). Inlining is definitely a
2559  * good idea in this case though, so mark the function appropriately.
2560  */
2561 static inline __always_inline void
2562 mxge_clean_rx_done(struct mxge_slice_state *ss)
2563 {
2564 	mxge_rx_done_t *rx_done = &ss->rx_done;
2565 	int limit = 0;
2566 	uint16_t length;
2567 	uint16_t checksum;
2568 
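	/*
	 * A non-zero length marks a valid completion entry; zeroing it
	 * returns the slot to the firmware for reuse.
	 */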
2569 	while (rx_done->entry[rx_done->idx].length != 0) {
2570 		length = ntohs(rx_done->entry[rx_done->idx].length);
2571 		rx_done->entry[rx_done->idx].length = 0;
2572 		checksum = rx_done->entry[rx_done->idx].checksum;
2573 		if (length <= (MHLEN - MXGEFW_PAD))
2574 			mxge_rx_done_small(ss, length, checksum);
2575 		else
2576 			mxge_rx_done_big(ss, length, checksum);
2577 		rx_done->cnt++;
2578 		rx_done->idx = rx_done->cnt & rx_done->mask;
2579 
2580 		/* limit potential for livelock */
2581 		if (__predict_false(++limit > rx_done->mask / 2))
2582 			break;
2583 	}
2584 #ifdef INET
2585 	while (!SLIST_EMPTY(&ss->lro_active)) {
2586 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2587 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2588 		mxge_lro_flush(ss, lro);
2589 	}
2590 #endif
2591 }
2592 
2593 
2594 static inline void
2595 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2596 {
2597 	struct ifnet *ifp;
2598 	mxge_tx_ring_t *tx;
2599 	struct mbuf *m;
2600 	bus_dmamap_t map;
2601 	int idx;
2602 
2603 	tx = &ss->tx;
2604 	ifp = ss->sc->ifp;
2605 	ASSERT_SERIALIZED(ifp->if_serializer);
2606 	while (tx->pkt_done != mcp_idx) {
2607 		idx = tx->done & tx->mask;
2608 		tx->done++;
2609 		m = tx->info[idx].m;
2610 		/* mbuf and DMA map only attached to the first
2611 		   segment per-mbuf */
2612 		if (m != NULL) {
2613 			ss->obytes += m->m_pkthdr.len;
2614 			if (m->m_flags & M_MCAST)
2615 				ss->omcasts++;
2616 			ss->opackets++;
2617 			tx->info[idx].m = NULL;
2618 			map = tx->info[idx].map;
2619 			bus_dmamap_unload(tx->dmat, map);
2620 			m_freem(m);
2621 		}
2622 		if (tx->info[idx].flag) {
2623 			tx->info[idx].flag = 0;
2624 			tx->pkt_done++;
2625 		}
2626 	}
2627 
	/* If we have space, clear OACTIVE to tell the stack that
	   it's OK to send packets */
2630 	if (ifq_is_oactive(&ifp->if_snd) &&
2631 	    tx->req - tx->done < (tx->mask + 1)/4) {
2632 		ifq_clr_oactive(&ifp->if_snd);
2633 		ss->tx.wake++;
2634 		mxge_start_locked(ss);
2635 	}
2636 #ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/* let the NIC stop polling this queue, since there
		 * are no more transmits pending */
		*tx->send_stop = 1;
		tx->queue_active = 0;
		tx->deactivate++;
		wmb();
	}
#endif
}
2650 
2651 static struct mxge_media_type mxge_xfp_media_types[] =
2652 {
2653 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2654 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2655 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2656 	{0,		(1 << 5),	"10GBASE-ER"},
2657 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2658 	{0,		(1 << 3),	"10GBASE-SW"},
2659 	{0,		(1 << 2),	"10GBASE-LW"},
2660 	{0,		(1 << 1),	"10GBASE-EW"},
2661 	{0,		(1 << 0),	"Reserved"}
2662 };
2663 static struct mxge_media_type mxge_sfp_media_types[] =
2664 {
2665 	{0,		(1 << 7),	"Reserved"},
2666 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2667 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2668 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2669 };
2670 
2671 static void
2672 mxge_set_media(mxge_softc_t *sc, int type)
2673 {
2674 	sc->media_flags |= type;
2675 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2676 	ifmedia_set(&sc->media, sc->media_flags);
2677 }
2678 
2679 
2680 /*
2681  * Determine the media type for a NIC.  Some XFPs will identify
2682  * themselves only when their link is up, so this is initiated via a
2683  * link up interrupt.  However, this can potentially take up to
2684  * several milliseconds, so it is run via the watchdog routine, rather
2685  * than in the interrupt handler itself.   This need only be done
2686  * once, not each time the link is up.
2687  */
2688 static void
2689 mxge_media_probe(mxge_softc_t *sc)
2690 {
2691 	mxge_cmd_t cmd;
2692 	char *cage_type;
2693 	char *ptr;
2694 	struct mxge_media_type *mxge_media_types = NULL;
2695 	int i, err, ms, mxge_media_type_entries;
2696 	uint32_t byte;
2697 
2698 	sc->need_media_probe = 0;
2699 
2700 	/* if we've already set a media type, we're done */
2701 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2702 		return;
2703 
2704 	/*
	 * parse the product code to determine the interface type
2706 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2707 	 * after the 3rd dash in the driver's cached copy of the
2708 	 * EEPROM's product code string.
2709 	 */
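	/*
	 * For example, a product code of the form "10G-PCIE-8A-R" would
	 * select the XFP table via its trailing 'R' (illustrative only;
	 * the tests below list the characters actually recognized).
	 */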
2710 	ptr = sc->product_code_string;
	if (ptr == NULL) {
		device_printf(sc->dev, "Missing product code\n");
		return;
	}
2714 
2715 	for (i = 0; i < 3; i++, ptr++) {
2716 		ptr = index(ptr, '-');
2717 		if (ptr == NULL) {
2718 			device_printf(sc->dev,
2719 				      "only %d dashes in PC?!?\n", i);
2720 			return;
2721 		}
2722 	}
	if (*ptr == 'C') {
		/* -C is CX4 */
		mxge_set_media(sc, IFM_10G_CX4);
		return;
	} else if (*ptr == 'Q') {
		/* -Q is Quad Ribbon Fiber */
		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
		/* FreeBSD has no media type for Quad ribbon fiber */
		return;
	}
2734 
2735 	if (*ptr == 'R') {
2736 		/* -R is XFP */
2737 		mxge_media_types = mxge_xfp_media_types;
2738 		mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2739 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2740 		cage_type = "XFP";
2741 	}
2742 
	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2744 		/* -S or -2S is SFP+ */
2745 		mxge_media_types = mxge_sfp_media_types;
2746 		mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2747 		cage_type = "SFP+";
		byte = 3;	/* 10GbE compliance codes: SFP+ EEPROM byte 3 */
2749 	}
2750 
2751 	if (mxge_media_types == NULL) {
2752 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2753 		return;
2754 	}
2755 
	/*
	 * At this point we know the NIC has an XFP or SFP+ cage, so now
	 * we try to determine what is in the cage by using the
	 * firmware's I2C commands to read the module's 10GbE compliance
	 * register.  We read just one byte, which may take over
	 * a millisecond.
	 */
2763 
2764 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2765 	cmd.data1 = byte;
2766 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2767 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2768 		device_printf(sc->dev, "failed to read XFP\n");
2769 	}
2770 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2771 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2772 	}
2773 	if (err != MXGEFW_CMD_OK) {
2774 		return;
2775 	}
2776 
2777 	/* now we wait for the data to be cached */
2778 	cmd.data0 = byte;
2779 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2780 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2781 		DELAY(1000);
2782 		cmd.data0 = byte;
2783 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2784 	}
2785 	if (err != MXGEFW_CMD_OK) {
2786 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2787 			      cage_type, err, ms);
2788 		return;
2789 	}
2790 
2791 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2792 		if (mxge_verbose)
2793 			device_printf(sc->dev, "%s:%s\n", cage_type,
2794 				      mxge_media_types[0].name);
2795 		mxge_set_media(sc, IFM_10G_CX4);
2796 		return;
2797 	}
2798 	for (i = 1; i < mxge_media_type_entries; i++) {
2799 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2800 			if (mxge_verbose)
2801 				device_printf(sc->dev, "%s:%s\n",
2802 					      cage_type,
2803 					      mxge_media_types[i].name);
2804 
2805 			mxge_set_media(sc, mxge_media_types[i].flag);
2806 			return;
2807 		}
2808 	}
2809 	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2810 		      cmd.data0);
2811 
2812 	return;
2813 }
2814 
2815 static void
2816 mxge_intr(void *arg)
2817 {
2818 	struct mxge_slice_state *ss = arg;
2819 	mxge_softc_t *sc = ss->sc;
2820 	mcp_irq_data_t *stats = ss->fw_stats;
2821 	mxge_tx_ring_t *tx = &ss->tx;
2822 	mxge_rx_done_t *rx_done = &ss->rx_done;
2823 	uint32_t send_done_count;
2824 	uint8_t valid;
2825 
2826 
2827 #ifndef IFNET_BUF_RING
2828 	/* an interrupt on a non-zero slice is implicitly valid
2829 	   since MSI-X irqs are not shared */
2830 	if (ss != sc->ss) {
2831 		mxge_clean_rx_done(ss);
2832 		*ss->irq_claim = be32toh(3);
2833 		return;
2834 	}
2835 #endif
2836 
2837 	/* make sure the DMA has finished */
2838 	if (!stats->valid) {
2839 		return;
2840 	}
2841 	valid = stats->valid;
2842 
2843 	if (sc->legacy_irq) {
2844 		/* lower legacy IRQ  */
2845 		*sc->irq_deassert = 0;
2846 		if (!mxge_deassert_wait)
2847 			/* don't wait for conf. that irq is low */
2848 			stats->valid = 0;
2849 	} else {
2850 		stats->valid = 0;
2851 	}
2852 
2853 	/* loop while waiting for legacy irq deassertion */
2854 	do {
2855 		/* check for transmit completes and receives */
2856 		send_done_count = be32toh(stats->send_done_count);
2857 		while ((send_done_count != tx->pkt_done) ||
2858 		       (rx_done->entry[rx_done->idx].length != 0)) {
2859 			if (send_done_count != tx->pkt_done)
2860 				mxge_tx_done(ss, (int)send_done_count);
2861 			mxge_clean_rx_done(ss);
2862 			send_done_count = be32toh(stats->send_done_count);
2863 		}
2864 		if (sc->legacy_irq && mxge_deassert_wait)
2865 			wmb();
2866 	} while (*((volatile uint8_t *) &stats->valid));
2867 
2868 	/* fw link & error stats meaningful only on the first slice */
2869 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2870 		if (sc->link_state != stats->link_up) {
2871 			sc->link_state = stats->link_up;
2872 			if (sc->link_state) {
2873 				sc->ifp->if_link_state = LINK_STATE_UP;
2874 				if_link_state_change(sc->ifp);
2875 				if (mxge_verbose)
2876 					device_printf(sc->dev, "link up\n");
2877 			} else {
2878 				sc->ifp->if_link_state = LINK_STATE_DOWN;
2879 				if_link_state_change(sc->ifp);
2880 				if (mxge_verbose)
2881 					device_printf(sc->dev, "link down\n");
2882 			}
2883 			sc->need_media_probe = 1;
2884 		}
2885 		if (sc->rdma_tags_available !=
2886 		    be32toh(stats->rdma_tags_available)) {
2887 			sc->rdma_tags_available =
2888 				be32toh(stats->rdma_tags_available);
2889 			device_printf(sc->dev, "RDMA timed out! %d tags "
2890 				      "left\n", sc->rdma_tags_available);
2891 		}
2892 
2893 		if (stats->link_down) {
2894 			sc->down_cnt += stats->link_down;
2895 			sc->link_state = 0;
2896 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2897 			if_link_state_change(sc->ifp);
2898 		}
2899 	}
2900 
2901 	/* check to see if we have rx token to pass back */
2902 	if (valid & 0x1)
		*ss->irq_claim = be32toh(3);
2904 	*(ss->irq_claim + 1) = be32toh(3);
2905 }
2906 
2907 static void
2908 mxge_init(void *arg)
2909 {
2910 }
2911 
2912 
2913 
2914 static void
2915 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2916 {
2917 	struct lro_entry *lro_entry;
2918 	int i;
2919 
2920 	while (!SLIST_EMPTY(&ss->lro_free)) {
2921 		lro_entry = SLIST_FIRST(&ss->lro_free);
2922 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2923 		kfree(lro_entry, M_DEVBUF);
2924 	}
2925 
2926 	for (i = 0; i <= ss->rx_big.mask; i++) {
2927 		if (ss->rx_big.info[i].m == NULL)
2928 			continue;
2929 		bus_dmamap_unload(ss->rx_big.dmat,
2930 				  ss->rx_big.info[i].map);
2931 		m_freem(ss->rx_big.info[i].m);
2932 		ss->rx_big.info[i].m = NULL;
2933 	}
2934 
2935 	for (i = 0; i <= ss->rx_small.mask; i++) {
2936 		if (ss->rx_small.info[i].m == NULL)
2937 			continue;
2938 		bus_dmamap_unload(ss->rx_small.dmat,
2939 				  ss->rx_small.info[i].map);
2940 		m_freem(ss->rx_small.info[i].m);
2941 		ss->rx_small.info[i].m = NULL;
2942 	}
2943 
2944 	/* transmit ring used only on the first slice */
2945 	if (ss->tx.info == NULL)
2946 		return;
2947 
2948 	for (i = 0; i <= ss->tx.mask; i++) {
2949 		ss->tx.info[i].flag = 0;
2950 		if (ss->tx.info[i].m == NULL)
2951 			continue;
2952 		bus_dmamap_unload(ss->tx.dmat,
2953 				  ss->tx.info[i].map);
2954 		m_freem(ss->tx.info[i].m);
2955 		ss->tx.info[i].m = NULL;
2956 	}
2957 }
2958 
2959 static void
2960 mxge_free_mbufs(mxge_softc_t *sc)
2961 {
2962 	int slice;
2963 
2964 	for (slice = 0; slice < sc->num_slices; slice++)
2965 		mxge_free_slice_mbufs(&sc->ss[slice]);
2966 }
2967 
2968 static void
2969 mxge_free_slice_rings(struct mxge_slice_state *ss)
2970 {
2971 	int i;
2972 
2973 
2974 	if (ss->rx_done.entry != NULL)
2975 		mxge_dma_free(&ss->rx_done.dma);
2976 	ss->rx_done.entry = NULL;
2977 
2978 	if (ss->tx.req_bytes != NULL)
2979 		kfree(ss->tx.req_bytes, M_DEVBUF);
2980 	ss->tx.req_bytes = NULL;
2981 
2982 	if (ss->tx.seg_list != NULL)
2983 		kfree(ss->tx.seg_list, M_DEVBUF);
2984 	ss->tx.seg_list = NULL;
2985 
2986 	if (ss->rx_small.shadow != NULL)
2987 		kfree(ss->rx_small.shadow, M_DEVBUF);
2988 	ss->rx_small.shadow = NULL;
2989 
2990 	if (ss->rx_big.shadow != NULL)
2991 		kfree(ss->rx_big.shadow, M_DEVBUF);
2992 	ss->rx_big.shadow = NULL;
2993 
2994 	if (ss->tx.info != NULL) {
2995 		if (ss->tx.dmat != NULL) {
2996 			for (i = 0; i <= ss->tx.mask; i++) {
2997 				bus_dmamap_destroy(ss->tx.dmat,
2998 						   ss->tx.info[i].map);
2999 			}
3000 			bus_dma_tag_destroy(ss->tx.dmat);
3001 		}
3002 		kfree(ss->tx.info, M_DEVBUF);
3003 	}
3004 	ss->tx.info = NULL;
3005 
3006 	if (ss->rx_small.info != NULL) {
3007 		if (ss->rx_small.dmat != NULL) {
3008 			for (i = 0; i <= ss->rx_small.mask; i++) {
3009 				bus_dmamap_destroy(ss->rx_small.dmat,
3010 						   ss->rx_small.info[i].map);
3011 			}
3012 			bus_dmamap_destroy(ss->rx_small.dmat,
3013 					   ss->rx_small.extra_map);
3014 			bus_dma_tag_destroy(ss->rx_small.dmat);
3015 		}
3016 		kfree(ss->rx_small.info, M_DEVBUF);
3017 	}
3018 	ss->rx_small.info = NULL;
3019 
3020 	if (ss->rx_big.info != NULL) {
3021 		if (ss->rx_big.dmat != NULL) {
3022 			for (i = 0; i <= ss->rx_big.mask; i++) {
3023 				bus_dmamap_destroy(ss->rx_big.dmat,
3024 						   ss->rx_big.info[i].map);
3025 			}
3026 			bus_dmamap_destroy(ss->rx_big.dmat,
3027 					   ss->rx_big.extra_map);
3028 			bus_dma_tag_destroy(ss->rx_big.dmat);
3029 		}
3030 		kfree(ss->rx_big.info, M_DEVBUF);
3031 	}
3032 	ss->rx_big.info = NULL;
3033 }
3034 
3035 static void
3036 mxge_free_rings(mxge_softc_t *sc)
3037 {
3038 	int slice;
3039 
3040 	for (slice = 0; slice < sc->num_slices; slice++)
3041 		mxge_free_slice_rings(&sc->ss[slice]);
3042 }
3043 
3044 static int
3045 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3046 		       int tx_ring_entries)
3047 {
3048 	mxge_softc_t *sc = ss->sc;
3049 	size_t bytes;
3050 	int err, i;
3051 
3052 	err = ENOMEM;
3053 
3054 	/* allocate per-slice receive resources */
3055 
3056 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3057 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
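	/* the completion (rx_done) ring collects events from both the
	   small and big rx rings, hence twice the entries */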
3058 
3059 	/* allocate the rx shadow rings */
3060 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3061 	ss->rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3062 
3063 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3064 	ss->rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3065 
3066 	/* allocate the rx host info rings */
3067 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3068 	ss->rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3069 
3070 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3071 	ss->rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3072 
3073 	/* allocate the rx busdma resources */
3074 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3075 				 1,			/* alignment */
3076 				 4096,			/* boundary */
3077 				 BUS_SPACE_MAXADDR,	/* low */
3078 				 BUS_SPACE_MAXADDR,	/* high */
3079 				 NULL, NULL,		/* filter */
3080 				 MHLEN,			/* maxsize */
3081 				 1,			/* num segs */
3082 				 MHLEN,			/* maxsegsize */
3083 				 BUS_DMA_ALLOCNOW,	/* flags */
3084 				 &ss->rx_small.dmat);	/* tag */
3085 	if (err != 0) {
3086 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3087 			      err);
3088 		return err;
3089 	}
3090 
3091 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3092 				 1,			/* alignment */
3093 #if MXGE_VIRT_JUMBOS
3094 				 4096,			/* boundary */
3095 #else
3096 				 0,			/* boundary */
3097 #endif
3098 				 BUS_SPACE_MAXADDR,	/* low */
3099 				 BUS_SPACE_MAXADDR,	/* high */
3100 				 NULL, NULL,		/* filter */
3101 				 3*4096,		/* maxsize */
3102 #if MXGE_VIRT_JUMBOS
3103 				 3,			/* num segs */
3104 				 4096,			/* maxsegsize*/
3105 #else
3106 				 1,			/* num segs */
3107 				 MJUM9BYTES,		/* maxsegsize*/
3108 #endif
3109 				 BUS_DMA_ALLOCNOW,	/* flags */
3110 				 &ss->rx_big.dmat);	/* tag */
3111 	if (err != 0) {
3112 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3113 			      err);
3114 		return err;
3115 	}
3116 	for (i = 0; i <= ss->rx_small.mask; i++) {
3117 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3118 					&ss->rx_small.info[i].map);
3119 		if (err != 0) {
3120 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3121 				      err);
3122 			return err;
3123 		}
3124 	}
3125 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3126 				&ss->rx_small.extra_map);
3127 	if (err != 0) {
3128 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3129 			      err);
3130 		return err;
3131 	}
3132 
3133 	for (i = 0; i <= ss->rx_big.mask; i++) {
3134 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3135 					&ss->rx_big.info[i].map);
3136 		if (err != 0) {
3137 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3138 				      err);
3139 			return err;
3140 		}
3141 	}
3142 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3143 				&ss->rx_big.extra_map);
3144 	if (err != 0) {
3145 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3146 			      err);
3147 		return err;
3148 	}
3149 
	/* now allocate TX resources */
3151 
3152 #ifndef IFNET_BUF_RING
3153 	/* only use a single TX ring for now */
3154 	if (ss != ss->sc->ss)
3155 		return 0;
3156 #endif
3157 
3158 	ss->tx.mask = tx_ring_entries - 1;
3159 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3160 
3161 
3162 	/* allocate the tx request copy block */
3163 	bytes = 8 +
3164 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3165 	ss->tx.req_bytes = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3166 	/* ensure req_list entries are aligned to 8 bytes */
3167 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3168 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
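	/* the block above was sized with 8 spare bytes so the pointer
	   can be rounded up to an 8-byte boundary */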
3169 
3170 	/* allocate the tx busdma segment list */
3171 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3176 
3177 	/* allocate the tx host info ring */
3178 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3179 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3180 
3181 	/* allocate the tx busdma resources */
3182 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3183 				 1,			/* alignment */
3184 				 sc->tx_boundary,	/* boundary */
3185 				 BUS_SPACE_MAXADDR,	/* low */
3186 				 BUS_SPACE_MAXADDR,	/* high */
3187 				 NULL, NULL,		/* filter */
3188 				 65536 + 256,		/* maxsize */
3189 				 ss->tx.max_desc - 2,	/* num segs */
3190 				 sc->tx_boundary,	/* maxsegsz */
3191 				 BUS_DMA_ALLOCNOW,	/* flags */
3192 				 &ss->tx.dmat);		/* tag */
3193 
3194 	if (err != 0) {
3195 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3196 			      err);
3197 		return err;
3198 	}
3199 
3200 	/* now use these tags to setup dmamaps for each slot
3201 	   in the ring */
3202 	for (i = 0; i <= ss->tx.mask; i++) {
3203 		err = bus_dmamap_create(ss->tx.dmat, 0,
3204 					&ss->tx.info[i].map);
3205 		if (err != 0) {
3206 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3207 				      err);
3208 			return err;
3209 		}
3210 	}
3211 	return 0;
}
3214 
3215 static int
3216 mxge_alloc_rings(mxge_softc_t *sc)
3217 {
3218 	mxge_cmd_t cmd;
3219 	int tx_ring_size;
3220 	int tx_ring_entries, rx_ring_entries;
3221 	int err, slice;
3222 
3223 	/* get ring sizes */
3224 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3225 	tx_ring_size = cmd.data0;
3226 	if (err != 0) {
3227 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3228 		goto abort;
3229 	}
3230 
3231 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3232 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3233 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3234 	ifq_set_ready(&sc->ifp->if_snd);
3235 
3236 	for (slice = 0; slice < sc->num_slices; slice++) {
3237 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3238 					     rx_ring_entries,
3239 					     tx_ring_entries);
3240 		if (err != 0)
3241 			goto abort;
3242 	}
3243 	return 0;
3244 
3245 abort:
3246 	mxge_free_rings(sc);
3247 	return err;
}
3250 
3251 
3252 static void
3253 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3254 {
3255 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3256 
3257 	if (bufsize < MCLBYTES) {
3258 		/* easy, everything fits in a single buffer */
3259 		*big_buf_size = MCLBYTES;
3260 		*cl_size = MCLBYTES;
3261 		*nbufs = 1;
3262 		return;
3263 	}
3264 
3265 	if (bufsize < MJUMPAGESIZE) {
3266 		/* still easy, everything still fits in a single buffer */
3267 		*big_buf_size = MJUMPAGESIZE;
3268 		*cl_size = MJUMPAGESIZE;
3269 		*nbufs = 1;
3270 		return;
3271 	}
3272 #if MXGE_VIRT_JUMBOS
3273 	/* now we need to use virtually contiguous buffers */
3274 	*cl_size = MJUM9BYTES;
3275 	*big_buf_size = 4096;
3276 	*nbufs = mtu / 4096 + 1;
3277 	/* needs to be a power of two, so round up */
3278 	if (*nbufs == 3)
3279 		*nbufs = 4;
3280 #else
3281 	*cl_size = MJUM9BYTES;
3282 	*big_buf_size = MJUM9BYTES;
3283 	*nbufs = 1;
3284 #endif
3285 }
3286 
3287 static int
3288 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3289 {
3290 	mxge_softc_t *sc;
3291 	mxge_cmd_t cmd;
3292 	bus_dmamap_t map;
3293 	struct lro_entry *lro_entry;
3294 	int err, i, slice;
3295 
3296 
3297 	sc = ss->sc;
3298 	slice = ss - sc->ss;
3299 
3300 	SLIST_INIT(&ss->lro_free);
3301 	SLIST_INIT(&ss->lro_active);
3302 
3303 	for (i = 0; i < sc->lro_cnt; i++) {
3304 		lro_entry = (struct lro_entry *)
3305 			kmalloc(sizeof (*lro_entry), M_DEVBUF,
3306 			       M_NOWAIT | M_ZERO);
3307 		if (lro_entry == NULL) {
3308 			sc->lro_cnt = i;
3309 			break;
3310 		}
3311 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3312 	}
3313 	/* get the lanai pointers to the send and receive rings */
3314 
3315 	err = 0;
3316 #ifndef IFNET_BUF_RING
3317 	/* We currently only send from the first slice */
3318 	if (slice == 0) {
3319 #endif
3320 		cmd.data0 = slice;
3321 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3322 		ss->tx.lanai =
3323 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3324 		ss->tx.send_go = (volatile uint32_t *)
3325 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3326 		ss->tx.send_stop = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3328 #ifndef IFNET_BUF_RING
3329 	}
3330 #endif
3331 	cmd.data0 = slice;
3332 	err |= mxge_send_cmd(sc,
3333 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3334 	ss->rx_small.lanai =
3335 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3336 	cmd.data0 = slice;
3337 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3338 	ss->rx_big.lanai =
3339 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3340 
3341 	if (err != 0) {
3342 		device_printf(sc->dev,
3343 			      "failed to get ring sizes or locations\n");
3344 		return EIO;
3345 	}
3346 
3347 	/* stock receive rings */
3348 	for (i = 0; i <= ss->rx_small.mask; i++) {
3349 		map = ss->rx_small.info[i].map;
3350 		err = mxge_get_buf_small(ss, map, i);
3351 		if (err) {
3352 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3353 				      i, ss->rx_small.mask + 1);
3354 			return ENOMEM;
3355 		}
3356 	}
	/* 0xffffffff in addr_low marks a receive slot as not yet valid
	   (see mxge_submit_8rx) */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
3361 	ss->rx_big.nbufs = nbufs;
3362 	ss->rx_big.cl_size = cl_size;
3363 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3364 		EVL_ENCAPLEN + MXGEFW_PAD;
3365 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3366 		map = ss->rx_big.info[i].map;
3367 		err = mxge_get_buf_big(ss, map, i);
3368 		if (err) {
3369 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3370 				      i, ss->rx_big.mask + 1);
3371 			return ENOMEM;
3372 		}
3373 	}
3374 	return 0;
3375 }
3376 
3377 static int
3378 mxge_open(mxge_softc_t *sc)
3379 {
3380 	mxge_cmd_t cmd;
3381 	int err, big_bytes, nbufs, slice, cl_size, i;
3382 	bus_addr_t bus;
3383 	volatile uint8_t *itable;
3384 	struct mxge_slice_state *ss;
3385 
3386 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3387 	/* Copy the MAC address in case it was overridden */
3388 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3389 
3390 	err = mxge_reset(sc, 1);
3391 	if (err != 0) {
3392 		device_printf(sc->dev, "failed to reset\n");
3393 		return EIO;
3394 	}
3395 
3396 	if (sc->num_slices > 1) {
3397 		/* setup the indirection table */
3398 		cmd.data0 = sc->num_slices;
3399 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3400 				    &cmd);
3401 
3402 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3403 				     &cmd);
3404 		if (err != 0) {
3405 			device_printf(sc->dev,
3406 				      "failed to setup rss tables\n");
3407 			return err;
3408 		}
3409 
3410 		/* just enable an identity mapping */
3411 		itable = sc->sram + cmd.data0;
3412 		for (i = 0; i < sc->num_slices; i++)
3413 			itable[i] = (uint8_t)i;
3414 
3415 		cmd.data0 = 1;
3416 		cmd.data1 = mxge_rss_hash_type;
3417 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3418 		if (err != 0) {
3419 			device_printf(sc->dev, "failed to enable slices\n");
3420 			return err;
3421 		}
3422 	}
3423 
3424 
3425 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3426 
3427 	cmd.data0 = nbufs;
3428 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3429 			    &cmd);
3430 	/* error is only meaningful if we're trying to set
3431 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3432 	if (err && nbufs > 1) {
3433 		device_printf(sc->dev,
			      "Failed to set always-use-n to %d\n",
3435 			      nbufs);
3436 		return EIO;
3437 	}
3438 	/* Give the firmware the mtu and the big and small buffer
3439 	   sizes.  The firmware wants the big buf size to be a power
3440 	   of two. Luckily, FreeBSD's clusters are powers of two */
3441 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3442 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3443 	cmd.data0 = MHLEN - MXGEFW_PAD;
3444 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3445 			     &cmd);
3446 	cmd.data0 = big_bytes;
3447 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3448 
3449 	if (err != 0) {
3450 		device_printf(sc->dev, "failed to setup params\n");
3451 		goto abort;
3452 	}
3453 
	/* Now give the firmware the pointer to the stats block */
3455 	for (slice = 0;
3456 #ifdef IFNET_BUF_RING
3457 	     slice < sc->num_slices;
3458 #else
3459 	     slice < 1;
3460 #endif
3461 	     slice++) {
3462 		ss = &sc->ss[slice];
3463 		cmd.data0 =
3464 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3465 		cmd.data1 =
3466 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3467 		cmd.data2 = sizeof(struct mcp_irq_data);
3468 		cmd.data2 |= (slice << 16);
3469 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3470 	}
3471 
3472 	if (err != 0) {
3473 		bus = sc->ss->fw_stats_dma.bus_addr;
3474 		bus += offsetof(struct mcp_irq_data, send_done_count);
3475 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3476 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3477 		err = mxge_send_cmd(sc,
3478 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3479 				    &cmd);
3480 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3481 		sc->fw_multicast_support = 0;
3482 	} else {
3483 		sc->fw_multicast_support = 1;
3484 	}
3485 
3486 	if (err != 0) {
3487 		device_printf(sc->dev, "failed to setup params\n");
3488 		goto abort;
3489 	}
3490 
3491 	for (slice = 0; slice < sc->num_slices; slice++) {
3492 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3493 		if (err != 0) {
3494 			device_printf(sc->dev, "couldn't open slice %d\n",
3495 				      slice);
3496 			goto abort;
3497 		}
3498 	}
3499 
3500 	/* Finally, start the firmware running */
3501 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3502 	if (err) {
3503 		device_printf(sc->dev, "Couldn't bring up link\n");
3504 		goto abort;
3505 	}
3506 	sc->ifp->if_flags |= IFF_RUNNING;
3507 	ifq_clr_oactive(&sc->ifp->if_snd);
3508 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3509 
3510 	return 0;
3511 
3512 
3513 abort:
3514 	mxge_free_mbufs(sc);
3515 
3516 	return err;
3517 }
3518 
3519 static int
3520 mxge_close(mxge_softc_t *sc)
3521 {
3522 	mxge_cmd_t cmd;
3523 	int err, old_down_cnt;
3524 #ifdef IFNET_BUF_RING
3525 	struct mxge_slice_state *ss;
3526 	int slice;
3527 #endif
3528 
3529 	ASSERT_SERIALIZED(sc->ifp->if_serializer);
3530 	callout_stop(&sc->co_hdl);
3531 #ifdef IFNET_BUF_RING
3532 	for (slice = 0; slice < sc->num_slices; slice++) {
3533 		ss = &sc->ss[slice];
3534 		ss->if_flags &= ~IFF_RUNNING;
3535 	}
3536 #endif
3537 	sc->ifp->if_flags &= ~IFF_RUNNING;
3538 	old_down_cnt = sc->down_cnt;
3539 	wmb();
3540 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3541 	if (err) {
3542 		device_printf(sc->dev, "Couldn't bring down link\n");
3543 	}
3544 	if (old_down_cnt == sc->down_cnt) {
3545 		/* wait for down irq */
3546 		DELAY(10 * sc->intr_coal_delay);
3547 	}
3548 	wmb();
3549 	if (old_down_cnt == sc->down_cnt) {
3550 		device_printf(sc->dev, "never got down irq\n");
3551 	}
3552 
3553 	mxge_free_mbufs(sc);
3554 
3555 	return 0;
3556 }
3557 
3558 static void
3559 mxge_setup_cfg_space(mxge_softc_t *sc)
3560 {
3561 	device_t dev = sc->dev;
3562 	int reg;
3563 	uint16_t cmd, lnk, pectl;
3564 
	/* find the PCIe link width and set max read request to 4KB */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* Link Status register: bits 9:4 are negotiated width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* Device Control register: bits 14:12 are
		   Max_Read_Request_Size; 101b selects 4096 bytes */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}
3574 
3575 	/* Enable DMA and Memory space access */
3576 	pci_enable_busmaster(dev);
3577 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3578 	cmd |= PCIM_CMD_MEMEN;
3579 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3580 }
3581 
3582 static uint32_t
3583 mxge_read_reboot(mxge_softc_t *sc)
3584 {
3585 	device_t dev = sc->dev;
3586 	uint32_t vs;
3587 
3588 	/* find the vendor specific offset */
3589 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3590 		device_printf(sc->dev,
3591 			      "could not find vendor specific offset\n");
3592 		return (uint32_t)-1;
3593 	}
3594 	/* enable read32 mode */
3595 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3596 	/* tell NIC which register to read */
3597 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3598 	return (pci_read_config(dev, vs + 0x14, 4));
3599 }
3600 
3601 static int
3602 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3603 {
3604 	struct pci_devinfo *dinfo;
3605 	mxge_tx_ring_t *tx;
3606 	int err;
3607 	uint32_t reboot;
3608 	uint16_t cmd;
3609 
3610 	err = ENXIO;
3611 
3612 	device_printf(sc->dev, "Watchdog reset!\n");
3613 
3614 	/*
3615 	 * check to see if the NIC rebooted.  If it did, then all of
3616 	 * PCI config space has been reset, and things like the
3617 	 * busmaster bit will be zero.  If this is the case, then we
3618 	 * must restore PCI config space before the NIC can be used
3619 	 * again
3620 	 */
3621 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3622 	if (cmd == 0xffff) {
3623 		/*
3624 		 * maybe the watchdog caught the NIC rebooting; wait
3625 		 * up to 100ms for it to finish.  If it does not come
3626 		 * back, then give up
3627 		 */
3628 		DELAY(1000*100);
3629 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3630 		if (cmd == 0xffff) {
3631 			device_printf(sc->dev, "NIC disappeared!\n");
3632 			return (err);
3633 		}
3634 	}
3635 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3636 		/* print the reboot status */
3637 		reboot = mxge_read_reboot(sc);
3638 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3639 			      reboot);
3640 		/* restore PCI configuration space */
3641 		dinfo = device_get_ivars(sc->dev);
3642 		pci_cfg_restore(sc->dev, dinfo);
3643 
3644 		/* and redo any changes we made to our config space */
3645 		mxge_setup_cfg_space(sc);
3646 
3647 		if (sc->ifp->if_flags & IFF_RUNNING) {
3648 			mxge_close(sc);
3649 			err = mxge_open(sc);
3650 		}
3651 	} else {
3652 		tx = &sc->ss[slice].tx;
3653 		device_printf(sc->dev,
3654 			      "NIC did not reboot, slice %d ring state:\n",
3655 			      slice);
3656 		device_printf(sc->dev,
3657 			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3658 			      tx->req, tx->done, tx->queue_active);
3659 		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3660 			      tx->activate, tx->deactivate);
3661 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3662 			      tx->pkt_done,
3663 			      be32toh(sc->ss->fw_stats->send_done_count));
3664 		device_printf(sc->dev, "not resetting\n");
3665 	}
3666 	return (err);
3667 }
3668 
3669 static int
3670 mxge_watchdog(mxge_softc_t *sc)
3671 {
3672 	mxge_tx_ring_t *tx;
3673 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3674 	int i, err = 0;
3675 
3676 	/* see if we have outstanding transmits, which
3677 	   have been pending for more than mxge_ticks */
3678 	for (i = 0;
3679 #ifdef IFNET_BUF_RING
3680 	     (i < sc->num_slices) && (err == 0);
3681 #else
3682 	     (i < 1) && (err == 0);
3683 #endif
3684 	     i++) {
3685 		tx = &sc->ss[i].tx;
3686 		if (tx->req != tx->done &&
3687 		    tx->watchdog_req != tx->watchdog_done &&
3688 		    tx->done == tx->watchdog_done) {
3689 			/* check for pause blocking before resetting */
3690 			if (tx->watchdog_rx_pause == rx_pause)
3691 				err = mxge_watchdog_reset(sc, i);
3692 			else
3693 				device_printf(sc->dev, "Flow control blocking "
3694 					      "xmits, check link partner\n");
3695 		}
3696 
3697 		tx->watchdog_req = tx->req;
3698 		tx->watchdog_done = tx->done;
3699 		tx->watchdog_rx_pause = rx_pause;
3700 	}
3701 
3702 	if (sc->need_media_probe)
3703 		mxge_media_probe(sc);
3704 	return (err);
3705 }
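/*
 * Illustration of the stall test above with hypothetical counter values:
 * at one tick we record req=100, done=90 (transmits outstanding); if at
 * the next tick done still sits at 90 while requests remain outstanding,
 * and no new pause frames were counted in the meantime, the ring made no
 * progress and mxge_watchdog_reset() is invoked.
 */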
3706 
3707 static void
3708 mxge_update_stats(mxge_softc_t *sc)
3709 {
3710 	struct mxge_slice_state *ss;
3711 	u_long ipackets = 0;
3712 	u_long opackets = 0;
3713 #ifdef IFNET_BUF_RING
3714 	u_long obytes = 0;
3715 	u_long omcasts = 0;
3716 	u_long odrops = 0;
3717 #endif
3718 	u_long oerrors = 0;
3719 	int slice;
3720 
3721 	for (slice = 0; slice < sc->num_slices; slice++) {
3722 		ss = &sc->ss[slice];
3723 		ipackets += ss->ipackets;
3724 		opackets += ss->opackets;
3725 #ifdef IFNET_BUF_RING
3726 		obytes += ss->obytes;
3727 		omcasts += ss->omcasts;
3728 		odrops += ss->tx.br->br_drops;
3729 #endif
3730 		oerrors += ss->oerrors;
3731 	}
3732 	IFNET_STAT_SET(sc->ifp, ipackets, ipackets);
3733 	IFNET_STAT_SET(sc->ifp, opackets, opackets);
3734 #ifdef IFNET_BUF_RING
3735 	sc->ifp->if_obytes = obytes;
3736 	sc->ifp->if_omcasts = omcasts;
3737 	sc->ifp->if_snd.ifq_drops = odrops;
3738 #endif
3739 	IFNET_STAT_SET(sc->ifp, oerrors, oerrors);
3740 }
3741 
3742 static void
3743 mxge_tick(void *arg)
3744 {
3745 	mxge_softc_t *sc = arg;
3746 	int err = 0;
3747 
3748 	lwkt_serialize_enter(sc->ifp->if_serializer);
3749 	/* aggregate stats from different slices */
3750 	mxge_update_stats(sc);
3751 	if (!sc->watchdog_countdown) {
3752 		err = mxge_watchdog(sc);
3753 		sc->watchdog_countdown = 4;
3754 	}
3755 	sc->watchdog_countdown--;
3756 	if (err == 0)
3757 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3758 	lwkt_serialize_exit(sc->ifp->if_serializer);
3759 }
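/*
 * Timing note: with the default mxge_ticks of hz/2 (see
 * mxge_fetch_tunables() below), mxge_tick() runs every 500ms and the
 * countdown above runs the watchdog once every four ticks, i.e. roughly
 * every 2 seconds.
 */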
3760 
3761 static int
3762 mxge_media_change(struct ifnet *ifp)
3763 {
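	/* changing media is not supported on this NIC; autoselect only */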
3764 	return EINVAL;
3765 }
3766 
3767 static int
3768 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3769 {
3770 	struct ifnet *ifp = sc->ifp;
3771 	int real_mtu, old_mtu;
3772 	int err = 0;
3773 
3774 	if (ifp->if_serializer)
3775 		ASSERT_SERIALIZED(ifp->if_serializer);
3776 
3777 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
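	/* e.g. a 9000 byte MTU gives real_mtu = 9000 + 14 + 4 = 9018 */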
3778 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3779 		return EINVAL;
3780 	old_mtu = ifp->if_mtu;
3781 	ifp->if_mtu = mtu;
3782 	if (ifp->if_flags & IFF_RUNNING) {
3783 		mxge_close(sc);
3784 		err = mxge_open(sc);
3785 		if (err != 0) {
3786 			ifp->if_mtu = old_mtu;
3787 			mxge_close(sc);
3788 			(void) mxge_open(sc);
3789 		}
3790 	}
3791 	return err;
3792 }
3793 
3794 static void
3795 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3796 {
3797 	mxge_softc_t *sc = ifp->if_softc;
3798 
3799 
3800 	if (sc == NULL)
3801 		return;
3802 	ifmr->ifm_status = IFM_AVALID;
3803 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3804 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3805 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3806 }
3807 
3808 static int
3809 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data, struct ucred *cr)
3810 {
3811 	mxge_softc_t *sc = ifp->if_softc;
3812 	struct ifreq *ifr = (struct ifreq *)data;
3813 	int err, mask;
3814 
3815 	(void)cr;
3816 	err = 0;
3817 	ASSERT_SERIALIZED(ifp->if_serializer);
3818 	switch (command) {
3819 	case SIOCSIFADDR:
3820 	case SIOCGIFADDR:
3821 		err = ether_ioctl(ifp, command, data);
3822 		break;
3823 
3824 	case SIOCSIFMTU:
3825 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3826 		break;
3827 
3828 	case SIOCSIFFLAGS:
3829 		if (sc->dying) {
3830 			return EINVAL;
3831 		}
3832 		if (ifp->if_flags & IFF_UP) {
3833 			if (!(ifp->if_flags & IFF_RUNNING)) {
3834 				err = mxge_open(sc);
3835 			} else {
3836 				/* take care of promisc and allmulti
3837 				   flag changes */
3838 				mxge_change_promisc(sc,
3839 						    ifp->if_flags & IFF_PROMISC);
3840 				mxge_set_multicast_list(sc);
3841 			}
3842 		} else {
3843 			if (ifp->if_flags & IFF_RUNNING) {
3844 				mxge_close(sc);
3845 			}
3846 		}
3847 		break;
3848 
3849 	case SIOCADDMULTI:
3850 	case SIOCDELMULTI:
3851 		mxge_set_multicast_list(sc);
3852 		break;
3853 
3854 	case SIOCSIFCAP:
3855 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3856 		if (mask & IFCAP_TXCSUM) {
3857 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3858 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3859 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3860 						      | CSUM_TSO);
3861 			} else {
3862 				ifp->if_capenable |= IFCAP_TXCSUM;
3863 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3864 			}
3865 		} else if (mask & IFCAP_RXCSUM) {
3866 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3867 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3868 				sc->csum_flag = 0;
3869 			} else {
3870 				ifp->if_capenable |= IFCAP_RXCSUM;
3871 				sc->csum_flag = 1;
3872 			}
3873 		}
3874 		if (mask & IFCAP_TSO4) {
3875 			if (IFCAP_TSO4 & ifp->if_capenable) {
3876 				ifp->if_capenable &= ~IFCAP_TSO4;
3877 				ifp->if_hwassist &= ~CSUM_TSO;
3878 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3879 				ifp->if_capenable |= IFCAP_TSO4;
3880 				ifp->if_hwassist |= CSUM_TSO;
3881 			} else {
3882 				kprintf("mxge requires tx checksum offload"
3883 				       " be enabled to use TSO\n");
3884 				err = EINVAL;
3885 			}
3886 		}
3887 		if (mask & IFCAP_LRO) {
3888 			if (IFCAP_LRO & ifp->if_capenable)
3889 				err = mxge_change_lro_locked(sc, 0);
3890 			else
3891 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3892 		}
3893 		if (mask & IFCAP_VLAN_HWTAGGING)
3894 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3895 		VLAN_CAPABILITIES(ifp);
3896 
3897 		break;
3898 
3899 	case SIOCGIFMEDIA:
3900 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3901 				    &sc->media, command);
3902 		break;
3903 
3904 	default:
3905 		err = ENOTTY;
3906 	}
3907 	return err;
3908 }
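/*
 * The SIOCSIFCAP handling above is normally driven from userland via
 * ifconfig(8); illustrative examples (device name assumed):
 *   ifconfig mxge0 -txcsum    # also clears TSO4, as enforced above
 *   ifconfig mxge0 tso        # rejected unless txcsum is enabled
 *   ifconfig mxge0 lro        # re-enables LRO with mxge_lro_cnt
 */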
3909 
3910 static void
3911 mxge_fetch_tunables(mxge_softc_t *sc)
3912 {
3913 
3914 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3915 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3916 			  &mxge_flow_control);
3917 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3918 			  &mxge_intr_coal_delay);
3919 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3920 			  &mxge_nvidia_ecrc_enable);
3921 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3922 			  &mxge_force_firmware);
3923 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3924 			  &mxge_deassert_wait);
3925 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3926 			  &mxge_verbose);
3927 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3928 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3929 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3930 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3931 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
3932 	if (sc->lro_cnt != 0)
3933 		mxge_lro_cnt = sc->lro_cnt;
3934 
3935 	if (bootverbose)
3936 		mxge_verbose = 1;
3937 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3938 		mxge_intr_coal_delay = 30;
3939 	if (mxge_ticks == 0)
3940 		mxge_ticks = hz / 2;
3941 	sc->pause = mxge_flow_control;
3942 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3943 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
3944 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3945 	}
3946 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
3947 	    mxge_initial_mtu < ETHER_MIN_LEN)
3948 		mxge_initial_mtu = ETHERMTU_JUMBO;
3949 }
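/*
 * All of the tunables fetched above come from the kernel environment and
 * can be set at boot, e.g. in /boot/loader.conf (illustrative values;
 * the defaults appear in the code above):
 *   hw.mxge.intr_coal_delay="30"
 *   hw.mxge.flow_control_enabled="1"
 *   hw.mxge.max_slices="-1"        # -1 means cap slices at ncpus
 */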
3950 
3951 
3952 static void
3953 mxge_free_slices(mxge_softc_t *sc)
3954 {
3955 	struct mxge_slice_state *ss;
3956 	int i;
3957 
3958 
3959 	if (sc->ss == NULL)
3960 		return;
3961 
3962 	for (i = 0; i < sc->num_slices; i++) {
3963 		ss = &sc->ss[i];
3964 		if (ss->fw_stats != NULL) {
3965 			mxge_dma_free(&ss->fw_stats_dma);
3966 			ss->fw_stats = NULL;
3967 #ifdef IFNET_BUF_RING
3968 			if (ss->tx.br != NULL) {
3969 				drbr_free(ss->tx.br, M_DEVBUF);
3970 				ss->tx.br = NULL;
3971 			}
3972 #endif
3973 		}
3974 		if (ss->rx_done.entry != NULL) {
3975 			mxge_dma_free(&ss->rx_done.dma);
3976 			ss->rx_done.entry = NULL;
3977 		}
3978 	}
3979 	kfree(sc->ss, M_DEVBUF);
3980 	sc->ss = NULL;
3981 }
3982 
3983 static int
3984 mxge_alloc_slices(mxge_softc_t *sc)
3985 {
3986 	mxge_cmd_t cmd;
3987 	struct mxge_slice_state *ss;
3988 	size_t bytes;
3989 	int err, i, max_intr_slots;
3990 
3991 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3992 	if (err != 0) {
3993 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3994 		return err;
3995 	}
3996 	sc->rx_ring_size = cmd.data0;
3997 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
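	/*
	 * size the interrupt queue to hold a completion for every rx
	 * descriptor; the factor of two presumably covers both the
	 * small and big rx rings
	 */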
3998 
3999 	bytes = sizeof (*sc->ss) * sc->num_slices;
4000 	sc->ss = kmalloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4001 	if (sc->ss == NULL)
4002 		return (ENOMEM);
4003 	for (i = 0; i < sc->num_slices; i++) {
4004 		ss = &sc->ss[i];
4005 
4006 		ss->sc = sc;
4007 
4008 		/* allocate per-slice rx interrupt queues */
4009 
4010 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4011 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4012 		if (err != 0)
4013 			goto abort;
4014 		ss->rx_done.entry = ss->rx_done.dma.addr;
4015 		bzero(ss->rx_done.entry, bytes);
4016 
4017 		/*
4018 		 * allocate the per-slice firmware stats; stats
4019 		 * (including tx) are used only on the first
4020 		 * slice for now
4021 		 */
4022 #ifndef IFNET_BUF_RING
4023 		if (i > 0)
4024 			continue;
4025 #endif
4026 
4027 		bytes = sizeof (*ss->fw_stats);
4028 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4029 				     sizeof (*ss->fw_stats), 64);
4030 		if (err != 0)
4031 			goto abort;
4032 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4033 #ifdef IFNET_BUF_RING
4034 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4035 					   &ss->tx.lock);
4036 #endif
4037 	}
4038 
4039 	return (0);
4040 
4041 abort:
4042 	mxge_free_slices(sc);
4043 	return (ENOMEM);
4044 }
4045 
4046 static void
4047 mxge_slice_probe(mxge_softc_t *sc)
4048 {
4049 	mxge_cmd_t cmd;
4050 	char *old_fw;
4051 	int msix_cnt, status, max_intr_slots;
4052 
4053 	sc->num_slices = 1;
4054 	/*
4055 	 *  don't enable multiple slices if they have been disabled
4056 	 *  via the tunable, or if this is not an SMP system
4057 	 */
4058 
4059 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
4060 		return;
4061 
4062 	/* see how many MSI-X interrupts are available */
4063 	msix_cnt = pci_msix_count(sc->dev);
4064 	if (msix_cnt < 2)
4065 		return;
4066 
4067 	/* now load the slice-aware firmware and see what it supports */
4068 	old_fw = sc->fw_name;
4069 	if (old_fw == mxge_fw_aligned)
4070 		sc->fw_name = mxge_fw_rss_aligned;
4071 	else
4072 		sc->fw_name = mxge_fw_rss_unaligned;
4073 	status = mxge_load_firmware(sc, 0);
4074 	if (status != 0) {
4075 		device_printf(sc->dev, "Falling back to a single slice\n");
4076 		return;
4077 	}
4078 
4079 	/* try to send a reset command to the card to see if it
4080 	   is alive */
4081 	memset(&cmd, 0, sizeof (cmd));
4082 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4083 	if (status != 0) {
4084 		device_printf(sc->dev, "failed reset\n");
4085 		goto abort_with_fw;
4086 	}
4087 
4088 	/* get rx ring size */
4089 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4090 	if (status != 0) {
4091 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4092 		goto abort_with_fw;
4093 	}
4094 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4095 
4096 	/* tell it the size of the interrupt queues */
4097 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4098 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4099 	if (status != 0) {
4100 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4101 		goto abort_with_fw;
4102 	}
4103 
4104 	/* ask for the maximum number of slices it supports */
4105 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4106 	if (status != 0) {
4107 		device_printf(sc->dev,
4108 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4109 		goto abort_with_fw;
4110 	}
4111 	sc->num_slices = cmd.data0;
4112 	if (sc->num_slices > msix_cnt)
4113 		sc->num_slices = msix_cnt;
4114 
4115 	if (mxge_max_slices == -1) {
4116 		/* cap to number of CPUs in system */
4117 		if (sc->num_slices > ncpus)
4118 			sc->num_slices = ncpus;
4119 	} else {
4120 		if (sc->num_slices > mxge_max_slices)
4121 			sc->num_slices = mxge_max_slices;
4122 	}
4123 	/* round num_slices down to a power of two (e.g. 6 becomes 4) */
4124 	while (sc->num_slices & (sc->num_slices - 1))
4125 		sc->num_slices--;
4126 
4127 	if (mxge_verbose)
4128 		device_printf(sc->dev, "using %d slices\n",
4129 			      sc->num_slices);
4130 
4131 	return;
4132 
4133 abort_with_fw:
4134 	sc->fw_name = old_fw;
4135 	(void) mxge_load_firmware(sc, 0);
4136 }
4137 
4138 #if 0
4139 static int
4140 mxge_add_msix_irqs(mxge_softc_t *sc)
4141 {
4142 	size_t bytes;
4143 	int count, err, i, rid;
4144 
4145 	rid = PCIR_BAR(2);
4146 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4147 						    &rid, RF_ACTIVE);
4148 
4149 	if (sc->msix_table_res == NULL) {
4150 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4151 		return ENXIO;
4152 	}
4153 
4154 	count = sc->num_slices;
4155 	err = pci_alloc_msix(sc->dev, &count);
4156 	if (err != 0) {
4157 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4158 			      "err = %d\n", sc->num_slices, err);
4159 		goto abort_with_msix_table;
4160 	}
4161 	if (count < sc->num_slices) {
4162 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4163 			      sc->num_slices, count);
4164 		device_printf(sc->dev,
4165 			      "Try setting hw.mxge.max_slices to %d\n",
4166 			      count);
4167 		err = ENOSPC;
4168 		goto abort_with_msix;
4169 	}
4170 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4171 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4172 	if (sc->msix_irq_res == NULL) {
4173 		err = ENOMEM;
4174 		goto abort_with_msix;
4175 	}
4176 
4177 	for (i = 0; i < sc->num_slices; i++) {
4178 		rid = i + 1;
4179 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4180 							  SYS_RES_IRQ,
4181 							  &rid, RF_ACTIVE);
4182 		if (sc->msix_irq_res[i] == NULL) {
4183 			device_printf(sc->dev, "couldn't allocate IRQ res"
4184 				      " for message %d\n", i);
4185 			err = ENXIO;
4186 			goto abort_with_res;
4187 		}
4188 	}
4189 
4190 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4191 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4192 
4193 	for (i = 0; i < sc->num_slices; i++) {
4194 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4195 				     INTR_MPSAFE,
4196 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
4197 				     sc->ifp->if_serializer);
4198 		if (err != 0) {
4199 			device_printf(sc->dev, "couldn't setup intr for "
4200 				      "message %d\n", i);
4201 			goto abort_with_intr;
4202 		}
4203 	}
4204 
4205 	if (mxge_verbose) {
4206 		device_printf(sc->dev, "using %d msix IRQs:",
4207 			      sc->num_slices);
4208 		for (i = 0; i < sc->num_slices; i++)
4209 			kprintf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4210 		kprintf("\n");
4211 	}
4212 	return (0);
4213 
4214 abort_with_intr:
4215 	for (i = 0; i < sc->num_slices; i++) {
4216 		if (sc->msix_ih[i] != NULL) {
4217 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4218 					  sc->msix_ih[i]);
4219 			sc->msix_ih[i] = NULL;
4220 		}
4221 	}
4222 	kfree(sc->msix_ih, M_DEVBUF);
4223 
4224 
4225 abort_with_res:
4226 	for (i = 0; i < sc->num_slices; i++) {
4227 		rid = i + 1;
4228 		if (sc->msix_irq_res[i] != NULL)
4229 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4230 					     sc->msix_irq_res[i]);
4231 		sc->msix_irq_res[i] = NULL;
4232 	}
4233 	kfree(sc->msix_irq_res, M_DEVBUF);
4234 
4235 
4236 abort_with_msix:
4237 	pci_release_msi(sc->dev);
4238 
4239 abort_with_msix_table:
4240 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4241 			     sc->msix_table_res);
4242 
4243 	return err;
4244 }
4245 #endif
4246 
4247 static int
4248 mxge_add_single_irq(mxge_softc_t *sc)
4249 {
4250 	int err, rid;
4251 #ifdef OLD_MSI
4252 	int count;
4253 
4254 	count = pci_msi_count(sc->dev);
4255 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4256 		rid = 1;
4257 	} else {
4258 		rid = 0;
4259 		sc->legacy_irq = 1;
4260 	}
4261 #else
4262 	rid = 0;
4263 	sc->legacy_irq = 1;
4264 #endif
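	/* by convention, rid 0 is the legacy INTx line; MSI uses rid 1 */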
4265 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4266 					 1, RF_SHAREABLE | RF_ACTIVE);
4267 	if (sc->irq_res == NULL) {
4268 		device_printf(sc->dev, "could not alloc interrupt\n");
4269 		return ENXIO;
4270 	}
4271 	if (mxge_verbose)
4272 		device_printf(sc->dev, "using %s irq %ld\n",
4273 			      sc->legacy_irq ? "INTx" : "MSI",
4274 			      rman_get_start(sc->irq_res));
4275 	err = bus_setup_intr(sc->dev, sc->irq_res,
4276 			     INTR_MPSAFE,
4277 			     mxge_intr, &sc->ss[0], &sc->ih,
4278 			     sc->ifp->if_serializer);
4279 	if (err != 0) {
4280 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4281 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4282 		if (!sc->legacy_irq)
4283 			pci_release_msi(sc->dev);
4284 	}
4285 	return err;
4286 }
4287 
4288 #if 0
4289 static void
4290 mxge_rem_msix_irqs(mxge_softc_t *sc)
4291 {
4292 	int i, rid;
4293 
4294 	for (i = 0; i < sc->num_slices; i++) {
4295 		if (sc->msix_ih[i] != NULL) {
4296 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4297 					  sc->msix_ih[i]);
4298 			sc->msix_ih[i] = NULL;
4299 		}
4300 	}
4301 	kfree(sc->msix_ih, M_DEVBUF);
4302 
4303 	for (i = 0; i < sc->num_slices; i++) {
4304 		rid = i + 1;
4305 		if (sc->msix_irq_res[i] != NULL)
4306 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4307 					     sc->msix_irq_res[i]);
4308 		sc->msix_irq_res[i] = NULL;
4309 	}
4310 	kfree(sc->msix_irq_res, M_DEVBUF);
4311 
4312 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4313 			     sc->msix_table_res);
4314 
4315 	pci_release_msi(sc->dev);
4316 	return;
4317 }
4318 #endif
4319 
4320 static void
4321 mxge_rem_single_irq(mxge_softc_t *sc)
4322 {
4323 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4324 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4325 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4326 	if (!sc->legacy_irq)
4327 		pci_release_msi(sc->dev);
4328 }
4329 
4330 static void
4331 mxge_rem_irq(mxge_softc_t *sc)
4332 {
4333 #if 0
4334 	if (sc->num_slices > 1)
4335 		mxge_rem_msix_irqs(sc);
4336 	else
4337 #endif
4338 		mxge_rem_single_irq(sc);
4339 }
4340 
4341 static int
4342 mxge_add_irq(mxge_softc_t *sc)
4343 {
4344 #if 0
4345 	int err;
4346 
4347 	if (sc->num_slices > 1)
4348 		err = mxge_add_msix_irqs(sc);
4349 	else
4350 		err = mxge_add_single_irq(sc);
4351 
4352 	if (0 && err == 0 && sc->num_slices > 1) {
4353 		mxge_rem_msix_irqs(sc);
4354 		err = mxge_add_msix_irqs(sc);
4355 	}
4356 	return err;
4357 #else
4358 	return mxge_add_single_irq(sc);
4359 #endif
4360 }
4361 
4362 
4363 static int
4364 mxge_attach(device_t dev)
4365 {
4366 	mxge_softc_t *sc = device_get_softc(dev);
4367 	struct ifnet *ifp = &sc->arpcom.ac_if;
4368 	int err, rid;
4369 
4370 	/*
4371 	 * avoid rewriting half the lines in this file to use
4372 	 * &sc->arpcom.ac_if instead
4373 	 */
4374 	sc->ifp = ifp;
4375 	sc->dev = dev;
4376 	mxge_fetch_tunables(sc);
4377 
4378 	err = bus_dma_tag_create(NULL,			/* parent */
4379 				 1,			/* alignment */
4380 				 0,			/* boundary */
4381 				 BUS_SPACE_MAXADDR,	/* low */
4382 				 BUS_SPACE_MAXADDR,	/* high */
4383 				 NULL, NULL,		/* filter */
4384 				 65536 + 256,		/* maxsize */
4385 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4386 				 65536,			/* maxsegsize */
4387 				 0,			/* flags */
4388 				 &sc->parent_dmat);	/* tag */
4389 
4390 	if (err != 0) {
4391 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4392 			      err);
4393 		goto abort_with_nothing;
4394 	}
4395 
4396 	sc->ifp = ifp;
4397 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4398 
4399 	callout_init_mp(&sc->co_hdl);
4400 
4401 	mxge_setup_cfg_space(sc);
4402 
4403 	/* Map the board into the kernel */
4404 	rid = PCIR_BARS;
4405 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4406 					 ~0, 1, RF_ACTIVE);
4407 	if (sc->mem_res == NULL) {
4408 		device_printf(dev, "could not map memory\n");
4409 		err = ENXIO;
4410 		goto abort_with_nothing;
4411 	}
4412 	sc->sram = rman_get_virtual(sc->mem_res);
4413 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4414 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4415 		device_printf(dev, "impossible memory region size %ld\n",
4416 			      rman_get_size(sc->mem_res));
4417 		err = ENXIO;
4418 		goto abort_with_mem_res;
4419 	}
4420 
4421 	/* make a NUL-terminated copy of the EEPROM strings section of
4422 	   LANai SRAM */
4423 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4424 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4425 				rman_get_bushandle(sc->mem_res),
4426 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4427 				sc->eeprom_strings,
4428 				MXGE_EEPROM_STRINGS_SIZE - 2);
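	/*
	 * the strings section holds NUL-separated "key=value" records
	 * (for example a MAC=xx:xx:xx:xx:xx:xx entry, which
	 * mxge_parse_strings() below uses to recover the station address)
	 */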
4429 	err = mxge_parse_strings(sc);
4430 	if (err != 0)
4431 		goto abort_with_mem_res;
4432 
4433 	/* Enable write combining for efficient use of PCIe bus */
4434 	mxge_enable_wc(sc);
4435 
4436 	/* Allocate the out of band dma memory */
4437 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4438 			     sizeof (mxge_cmd_t), 64);
4439 	if (err != 0)
4440 		goto abort_with_mem_res;
4441 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4442 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4443 	if (err != 0)
4444 		goto abort_with_cmd_dma;
4445 
4446 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4447 	if (err != 0)
4448 		goto abort_with_zeropad_dma;
4449 
4450 	/* select & load the firmware */
4451 	err = mxge_select_firmware(sc);
4452 	if (err != 0)
4453 		goto abort_with_dmabench;
4454 	sc->intr_coal_delay = mxge_intr_coal_delay;
4455 
4456 	mxge_slice_probe(sc);
4457 	err = mxge_alloc_slices(sc);
4458 	if (err != 0)
4459 		goto abort_with_dmabench;
4460 
4461 	err = mxge_reset(sc, 0);
4462 	if (err != 0)
4463 		goto abort_with_slices;
4464 
4465 	err = mxge_alloc_rings(sc);
4466 	if (err != 0) {
4467 		device_printf(sc->dev, "failed to allocate rings\n");
4468 		goto abort_with_slices;
4469 	}
4470 
4471 	ifp->if_baudrate = IF_Gbps(10UL);
4472 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4473 		IFCAP_VLAN_MTU;
4474 #ifdef INET
4475 	ifp->if_capabilities |= IFCAP_LRO;
4476 #endif
4477 
4478 #ifdef MXGE_NEW_VLAN_API
4479 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4480 #endif
4481 
4482 	sc->max_mtu = mxge_max_mtu(sc);
4483 	if (sc->max_mtu >= 9000)
4484 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4485 	else
4486 		device_printf(dev, "MTU limited to %d.  Install "
4487 			      "latest firmware for 9000 byte jumbo support\n",
4488 			      sc->max_mtu - ETHER_HDR_LEN);
4489 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4490 	ifp->if_capenable = ifp->if_capabilities;
4491 	if (sc->lro_cnt == 0)
4492 		ifp->if_capenable &= ~IFCAP_LRO;
4493 	sc->csum_flag = 1;
4494 	ifp->if_init = mxge_init;
4495 	ifp->if_softc = sc;
4496 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4497 	ifp->if_ioctl = mxge_ioctl;
4498 	ifp->if_start = mxge_start;
4499 	/* Initialise the ifmedia structure */
4500 	ifmedia_init(&sc->media, 0, mxge_media_change,
4501 		     mxge_media_status);
4502 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4503 	mxge_media_probe(sc);
4504 	sc->dying = 0;
4505 	ether_ifattach(ifp, sc->mac_addr, NULL);
4506 	/* ether_ifattach sets mtu to ETHERMTU */
4507 	if (mxge_initial_mtu != ETHERMTU) {
4508 		lwkt_serialize_enter(ifp->if_serializer);
4509 		mxge_change_mtu(sc, mxge_initial_mtu);
4510 		lwkt_serialize_exit(ifp->if_serializer);
4511 	}
4512 	/* must come after ether_ifattach() */
4513 	err = mxge_add_irq(sc);
4514 	if (err != 0) {
4515 		device_printf(sc->dev, "failed to add irq\n");
4516 		goto abort_with_rings;
4517 	}
4518 
4519 	mxge_add_sysctls(sc);
4520 #ifdef IFNET_BUF_RING
4521 	ifp->if_transmit = mxge_transmit;
4522 	ifp->if_qflush = mxge_qflush;
4523 #endif
4524 	return 0;
4525 
4526 abort_with_rings:
4527 	mxge_free_rings(sc);
4528 abort_with_slices:
4529 	mxge_free_slices(sc);
4530 abort_with_dmabench:
4531 	mxge_dma_free(&sc->dmabench_dma);
4532 abort_with_zeropad_dma:
4533 	mxge_dma_free(&sc->zeropad_dma);
4534 abort_with_cmd_dma:
4535 	mxge_dma_free(&sc->cmd_dma);
4536 abort_with_mem_res:
4537 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4538 	pci_disable_busmaster(dev);
4539 	bus_dma_tag_destroy(sc->parent_dmat);
4540 abort_with_nothing:
4541 	return err;
4542 }
4543 
4544 static int
4545 mxge_detach(device_t dev)
4546 {
4547 	mxge_softc_t *sc = device_get_softc(dev);
4548 
4549 	lwkt_serialize_enter(sc->ifp->if_serializer);
4550 	sc->dying = 1;
4551 	if (sc->ifp->if_flags & IFF_RUNNING)
4552 		mxge_close(sc);
4553 	/*
4554 	 * XXX: race: the callout callback could be spinning on
4555 	 * the serializer and run anyway
4556 	 */
4557 	callout_stop(&sc->co_hdl);
4558 	lwkt_serialize_exit(sc->ifp->if_serializer);
4559 
4560 	ether_ifdetach(sc->ifp);
4561 	ifmedia_removeall(&sc->media);
4562 	mxge_dummy_rdma(sc, 0);
4563 	mxge_rem_sysctls(sc);
4564 	mxge_rem_irq(sc);
4565 	mxge_free_rings(sc);
4566 	mxge_free_slices(sc);
4567 	mxge_dma_free(&sc->dmabench_dma);
4568 	mxge_dma_free(&sc->zeropad_dma);
4569 	mxge_dma_free(&sc->cmd_dma);
4570 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4571 	pci_disable_busmaster(dev);
4572 	bus_dma_tag_destroy(sc->parent_dmat);
4573 	return 0;
4574 }
4575 
4576 static int
4577 mxge_shutdown(device_t dev)
4578 {
4579 	return 0;
4580 }
4581