1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
/*
 * Revision identification string compiled into the object file.
 * It is left out when the source is processed by lint.
 */
#ifndef	lint
static const char __idstring[] =
	"@(#)$Id: myri10ge.c,v 1.186 2009-06-29 13:47:22 gallatin Exp $";
#endif
36 
37 #define	MXGEFW_NDIS
38 #include "myri10ge_var.h"
39 #include "rss_eth_z8e.h"
40 #include "rss_ethp_z8e.h"
41 #include "mcp_gen_header.h"
42 
/* Largest MTU; only feeds myri10ge_mtu when MYRICOM_PRIV is defined. */
#define	MYRI10GE_MAX_ETHER_MTU 9014

/*
 * Interface run-state values.
 * NOTE(review): the field that stores these is not visible in this part
 * of the file -- confirm against the start/stop paths.
 */
#define	MYRI10GE_ETH_STOPPED 0
#define	MYRI10GE_ETH_STOPPING 1
#define	MYRI10GE_ETH_STARTING 2
#define	MYRI10GE_ETH_RUNNING 3
#define	MYRI10GE_ETH_OPEN_FAILED 4
#define	MYRI10GE_ETH_SUSPENDED_RUNNING 5

/*
 * Driver-wide tunables.  Unless noted, the consumers of these values
 * live outside this part of the file.
 */

/*
 * Size of one small receive buffer before MXGEFW_PAD is added; see
 * myri10ge_carve_up_jbufs_into_small_ring().
 */
static int myri10ge_small_bytes = 510;
static int myri10ge_intr_coal_delay = 125;
static int myri10ge_flow_control = 1;
/* only present on x86 / x86_64 builds */
#if #cpu(i386) || defined __i386 || defined i386 ||	\
	defined __i386__ || #cpu(x86_64) || defined __x86_64__
static int myri10ge_nvidia_ecrc_enable = 1;
#endif
static int myri10ge_mtu_override = 0;
static int myri10ge_tx_copylen = 512;
static int myri10ge_deassert_wait = 1;
/* non-zero enables extra console output, e.g. in myri10ge_read_mac_addr() */
static int myri10ge_verbose = 0;
static int myri10ge_watchdog_reset = 0;
static int myri10ge_use_msix = 1;
static int myri10ge_max_slices = -1;
static int myri10ge_use_msi = 1;
int myri10ge_force_firmware = 0;
static boolean_t myri10ge_use_lso = B_TRUE;
static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int myri10ge_tx_hash = 1;
static int myri10ge_lro = 0;
static int myri10ge_lro_cnt = 8;
int myri10ge_lro_max_aggr = 2;
static int myri10ge_lso_copy = 0;
static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
/* number of tx DMA handles pre-allocated by myri10ge_prepare_tx_ring() */
int myri10ge_tx_handles_initial = 128;

static 	kmutex_t myri10ge_param_lock;
static void* myri10ge_db_lastfree;
80 
/* DDI entry points; defined later in the file. */
static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int myri10ge_quiesce(dev_info_t *dip);

/* dev_ops structure wiring the attach/detach/quiesce entry points. */
DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
    myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);


/* Loadable-module description of this driver. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Myricom 10G driver (10GbE)",
	&myri10ge_ops,
};


/* Module linkage: the single driver description above, NULL terminated. */
static struct modlinkage modlinkage = {
	MODREV_1,
	{&modldrv, NULL},
};

/* The all-ones Ethernet broadcast address. */
unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
102 
/*
 * DMA attributes for miscellaneous allocations: 4KB aligned and limited
 * to a single scatter/gather element.  Not referenced in this part of
 * the file.
 */
static ddi_dma_attr_t myri10ge_misc_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	(uint64_t)0x7fffffff,		/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};

/*
 * The Myri10GE NIC has the following constraints on receive buffers:
 * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
 * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
 */

/*
 * Attributes for 4KB-aligned, single-cookie buffers.  Used by
 * myri10ge_add_jbuf() when myri10ge_mtu >= 2048, and for the transmit
 * copyblocks allocated in myri10ge_prepare_tx_ring().
 */
static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};

/*
 * Attributes used by myri10ge_add_jbuf() when myri10ge_mtu < 2048.
 * This table is deliberately not const: myri10ge_add_jbuf() rewrites
 * dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX at run time if
 * a buffer is found to cross a 4KB boundary or an allocation fails.
 */
static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
#if defined sparc64 || defined __sparcv9
	(uint64_t)4096,			/* alignment */
#else
	(uint64_t)0x80,			/* alignment */
#endif
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
#if defined sparc64 || defined __sparcv9
	UINT64_MAX,			/* maximum segment size */
#else
	(uint64_t)0xfff,		/* maximum segment size */
#endif
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};

/*
 * Attributes for the transmit DMA handles created in
 * myri10ge_add_tx_handle(): byte alignment and an effectively
 * unlimited scatter/gather list.
 */
static ddi_dma_attr_t myri10ge_tx_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)1,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	INT32_MAX,			/* scatter/gather list length */
	1,				/* granularity */
	0			/* attribute flags */
};
176 
/*
 * WC selects the data-ordering attribute below: merging is allowed
 * everywhere except on sparcv9, where strict ordering is requested.
 */
#if defined sparc64 || defined __sparcv9
#define	WC 0
#else
#define	WC 1
#endif

/* Device access attributes passed to ddi_dma_mem_alloc() in this file. */
struct ddi_device_acc_attr myri10ge_dev_access_attr = {
	DDI_DEVICE_ATTR_V0,		/* version */
	DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
	DDI_MERGING_OK_ACC		/* data order */
#else
	DDI_STRICTORDER_ACC
#endif
};

static void myri10ge_watchdog(void *arg);

/*
 * Size in bytes of each receive ("jumbo") buffer allocated by
 * myri10ge_add_jbuf(): frame size plus firmware pad plus VLAN tag.
 */
#ifdef MYRICOM_PRIV
int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
#else
int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
#endif
/* NOTE(review): consumers of the two bigbuf limits are outside this chunk. */
int myri10ge_bigbufs_initial = 1024;
int myri10ge_bigbufs_max = 4096;
202 
203 
204 caddr_t
205 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
206     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
207     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
208     int warn, int (*wait)(caddr_t))
209 {
210 	caddr_t  kaddr;
211 	size_t real_length;
212 	ddi_dma_cookie_t cookie;
213 	uint_t count;
214 	int err;
215 
216 	err = ddi_dma_alloc_handle(dip, attr, wait,
217 	    NULL, &dma->handle);
218 	if (err != DDI_SUCCESS) {
219 		if (warn)
220 			cmn_err(CE_WARN,
221 			    "myri10ge: ddi_dma_alloc_handle failed\n");
222 		goto abort_with_nothing;
223 	}
224 
225 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
226 	    wait, NULL, &kaddr, &real_length,
227 	    &dma->acc_handle);
228 	if (err != DDI_SUCCESS) {
229 		if (warn)
230 			cmn_err(CE_WARN,
231 			    "myri10ge: ddi_dma_mem_alloc failed\n");
232 		goto abort_with_handle;
233 	}
234 
235 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
236 	    bind_flags, wait, NULL, &cookie, &count);
237 
238 	if (err != DDI_SUCCESS) {
239 		if (warn)
240 			cmn_err(CE_WARN,
241 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
242 		goto abort_with_mem;
243 	}
244 
245 	if (count != 1) {
246 		if (warn)
247 			cmn_err(CE_WARN,
248 			    "myri10ge: got too many dma segments ");
249 		goto abort_with_bind;
250 	}
251 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
252 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
253 	return (kaddr);
254 
255 abort_with_bind:
256 	(void) ddi_dma_unbind_handle(dma->handle);
257 
258 abort_with_mem:
259 	ddi_dma_mem_free(&dma->acc_handle);
260 
261 abort_with_handle:
262 	ddi_dma_free_handle(&dma->handle);
263 abort_with_nothing:
264 	if (warn) {
265 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
266 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
267 		    (void*) dip, len, (void*) attr);
268 		cmn_err(CE_WARN,
269 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
270 		    (void*) accattr, alloc_flags);
271 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
272 		    bind_flags, (void*) dma);
273 	}
274 	return (NULL);
275 
276 }
277 
/*
 * Release a buffer obtained from myri10ge_dma_alloc(): unbind the DMA
 * handle, free the memory, then free the handle (the reverse of the
 * order in which myri10ge_dma_alloc() acquired them).
 */
void
myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
{
	(void) ddi_dma_unbind_handle(dma->handle);
	ddi_dma_mem_free(&dma->acc_handle);
	ddi_dma_free_handle(&dma->handle);
}
285 
/*
 * Copy size / 4 32-bit words from "from32" to "to", lowest address
 * first.  Each store goes through a volatile pointer so that it is
 * performed as an individual 32-bit write.  Any remainder of "size"
 * that is not a whole word is not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t nwords = size / 4;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from32[w];
}
299 
#if defined(_LP64)
/*
 * Copy size / 8 64-bit words from "from64" to "to", lowest address
 * first.  Each store goes through a volatile pointer so that it is
 * performed as an individual 64-bit write.  Any remainder of "size"
 * that is not a whole word is not copied.  Only built for LP64.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t nwords = size / 8;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from64[w];
}
#endif
315 
/*
 * Copy "size" bytes from host memory to the NIC using word-sized
 * stores: 64-bit words on LP64 kernels, 32-bit words otherwise.
 * "size" must be a multiple of that word size (checked with ASSERT),
 * and both pointers must be naturally aligned.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size % 8) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size % 4) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
333 
334 
335 /*
336  * Due to various bugs in Solaris (especially bug 6186772 where the
337  * TCP/UDP checksum is calculated incorrectly on mblk chains with more
338  * than two elements), and the design bug where hardware checksums are
339  * ignored on mblk chains with more than 2 elements, we need to
340  * allocate private pool of physically contiguous receive buffers.
341  */
342 
343 static void
344 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
345 {
346 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
347 
348 	bzero(jpool, sizeof (*jpool));
349 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
350 	    ss->mgp->icookie);
351 	jpool->head = NULL;
352 }
353 
354 static void
355 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
356 {
357 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
358 
359 	if (jpool->head != NULL) {
360 		cmn_err(CE_WARN,
361 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
362 		    ss->mgp->name);
363 	}
364 	mutex_destroy(&jpool->mtx);
365 }
366 
367 
/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 *
 * dst points at eight consecutive descriptors in NIC memory and src at
 * the matching eight host shadow descriptors.  The order of the writes
 * and barriers below is the point of this routine; do not reorder.
 */
static inline void
myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
{
	/* temporarily set the low (big-endian) bit of the first address */
	src->addr_low |= BE_32(1);
	/* first group of four descriptors */
	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	/* second group of four descriptors */
	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address and write it last, as described above */
	src->addr_low &= ~(BE_32(1));
	dst->addr_low = src->addr_low;
	mb();
}
387 
/*
 * Move every buffer sitting on the per-CPU free lists onto the tail of
 * the main pool list (jpool->head).
 *
 * The per-CPU lists are filled locklessly by myri10ge_jfree_rtn(), so
 * each one is detached with an atomic swap.  The main list itself is
 * modified without atomics; both callers in this part of the file run
 * with jpool->mtx held (myri10ge_remove_jbufs() takes it, and
 * myri10ge_restock_jumbos() is documented as requiring it).
 */
static void
myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
{
	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
	struct myri10ge_jpool_entry *jtail, *j, *jfree;
	volatile uintptr_t *putp;
	uintptr_t put;
	int i;

	/* find tail */
	jtail = NULL;
	if (jpool->head != NULL) {
		j = jpool->head;
		while (j->next != NULL)
			j = j->next;
		jtail = j;
	}

	/*
	 * iterate over all per-CPU caches, and add contents into
	 * jpool
	 */
	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
		/* take per-CPU free list */
		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
		/* cheap unlocked peek; skip lists that look empty */
		if (*putp == NULL)
			continue;
		/* detach the whole list in one step, leaving it empty */
		put = atomic_swap_ulong(putp, 0);
		jfree = (struct myri10ge_jpool_entry *)put;

		/* append to pool */
		if (jtail == NULL) {
			jpool->head = jfree;
		} else {
			jtail->next = jfree;
		}
		/* walk to the end of what was just appended */
		j = jfree;
		while (j->next != NULL)
			j = j->next;
		jtail = j;
	}
}
430 
431 /*
432  * Transfers buffers from the free pool to the nic
433  * Must be called holding the jpool mutex.
434  */
435 
436 static inline void
437 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
438 {
439 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
440 	struct myri10ge_jpool_entry *j;
441 	myri10ge_rx_ring_t *rx;
442 	int i, idx, limit;
443 
444 	rx = &ss->rx_big;
445 	limit = ss->j_rx_cnt + (rx->mask + 1);
446 
447 	for (i = rx->cnt; i != limit; i++) {
448 		idx = i & (rx->mask);
449 		j = jpool->head;
450 		if (j == NULL) {
451 			myri10ge_pull_jpool(ss);
452 			j = jpool->head;
453 			if (j == NULL) {
454 				break;
455 			}
456 		}
457 		jpool->head = j->next;
458 		rx->info[idx].j = j;
459 		rx->shadow[idx].addr_low = j->dma.low;
460 		rx->shadow[idx].addr_high = j->dma.high;
461 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
462 		if ((idx & 7) == 7) {
463 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
464 			    &rx->shadow[idx - 7]);
465 		}
466 	}
467 	rx->cnt = i;
468 }
469 
470 /*
471  * Transfer buffers from the nic to the free pool.
472  * Should be called holding the jpool mutex
473  */
474 
475 static inline void
476 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
477 {
478 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
479 	struct myri10ge_jpool_entry *j;
480 	myri10ge_rx_ring_t *rx;
481 	int i;
482 
483 	mutex_enter(&jpool->mtx);
484 	rx = &ss->rx_big;
485 
486 	for (i = 0; i < rx->mask + 1; i++) {
487 		j = rx->info[i].j;
488 		rx->info[i].j = NULL;
489 		if (j == NULL)
490 			continue;
491 		j->next = jpool->head;
492 		jpool->head = j;
493 	}
494 	mutex_exit(&jpool->mtx);
495 
496 }
497 
498 
/*
 * Free routine which is called when the mblk allocated via
 * esballoc() is freed.  Here the jumbo buffer is pushed onto the
 * free list of the current CPU without taking any lock; the buffers
 * are collected from there by myri10ge_pull_jpool().
 *
 * arg is the struct myri10ge_jpool_entry registered as free_arg in
 * myri10ge_add_jbuf().
 */

static void
myri10ge_jfree_rtn(void *arg)
{
	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
	struct myri10ge_jpool_stuff *jpool;
	volatile uintptr_t *putp;
	uintptr_t old, new;

	jpool = &j->ss->jpool;

	/* prepend buffer locklessly to per-CPU freelist */
	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
	new = (uintptr_t)j;
	do {
		/* link to the current head, then try to become the head */
		old = *putp;
		j->next = (void *)old;
		/* retry if the head changed between the read and the swap */
	} while (atomic_cas_ulong(putp, old, new) != old);
}
524 
/*
 * Destroy one jumbo buffer: unbind and free its DMA resources, then
 * free the entry itself.  The entry must no longer be on any list.
 */
static void
myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
{
	(void) ddi_dma_unbind_handle(j->dma_handle);
	ddi_dma_mem_free(&j->acc_handle);
	ddi_dma_free_handle(&j->dma_handle);
	kmem_free(j, sizeof (*j));
}
533 
534 
535 /*
536  * Allocates one physically contiguous descriptor
537  * and add it to the jumbo buffer pool.
538  */
539 
540 static int
541 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
542 {
543 	struct myri10ge_jpool_entry *j;
544 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
545 	ddi_dma_attr_t *rx_dma_attr;
546 	size_t real_length;
547 	ddi_dma_cookie_t cookie;
548 	uint_t count;
549 	int err;
550 
551 	if (myri10ge_mtu < 2048)
552 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
553 	else
554 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
555 
556 again:
557 	j = (struct myri10ge_jpool_entry *)
558 	    kmem_alloc(sizeof (*j), KM_SLEEP);
559 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
560 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
561 	if (err != DDI_SUCCESS)
562 		goto abort_with_j;
563 
564 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
565 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
566 	    NULL, &j->buf, &real_length, &j->acc_handle);
567 	if (err != DDI_SUCCESS)
568 		goto abort_with_handle;
569 
570 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
571 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
572 	    NULL, &cookie, &count);
573 	if (err != DDI_SUCCESS)
574 		goto abort_with_mem;
575 
576 	/*
577 	 * Make certain std MTU buffers do not cross a 4KB boundary:
578 	 *
579 	 * Setting dma_attr_align=4096 will do this, but the system
580 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
581 	 * Setting dma_attr_granular=4096 *seems* to work around this,
582 	 * but I'm paranoid about future systems no longer honoring
583 	 * this, so fall back to the safe, but memory wasting way if a
584 	 * buffer crosses a 4KB boundary.
585 	 */
586 
587 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
588 	    rx_dma_attr->dma_attr_align != 4096) {
589 		uint32_t start, end;
590 
591 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
592 		end = start + myri10ge_mtu;
593 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
594 			printf("std buffer crossed a 4KB boundary!\n");
595 			myri10ge_remove_jbuf(j);
596 			rx_dma_attr->dma_attr_align = 4096;
597 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
598 			goto again;
599 		}
600 	}
601 
602 	j->dma.low =
603 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
604 	j->dma.high =
605 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
606 	j->ss = ss;
607 
608 
609 	j->free_func.free_func = myri10ge_jfree_rtn;
610 	j->free_func.free_arg = (char *)j;
611 	mutex_enter(&jpool->mtx);
612 	j->next = jpool->head;
613 	jpool->head = j;
614 	jpool->num_alloc++;
615 	mutex_exit(&jpool->mtx);
616 	return (0);
617 
618 abort_with_mem:
619 	ddi_dma_mem_free(&j->acc_handle);
620 
621 abort_with_handle:
622 	ddi_dma_free_handle(&j->dma_handle);
623 
624 abort_with_j:
625 	kmem_free(j, sizeof (*j));
626 
627 	/*
628 	 * If an allocation failed, perhaps it failed because it could
629 	 * not satisfy granularity requirement.  Disable that, and
630 	 * try agin.
631 	 */
632 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
633 	    rx_dma_attr->dma_attr_align != 4096) {
634 			cmn_err(CE_NOTE,
635 			    "!alloc failed, reverting to gran=1\n");
636 			rx_dma_attr->dma_attr_align = 4096;
637 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
638 			goto again;
639 	}
640 	return (err);
641 }
642 
643 static int
644 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
645 {
646 	int i;
647 	struct myri10ge_jpool_entry *j;
648 
649 	mutex_enter(&jpool->mtx);
650 	j = jpool->head;
651 	i = 0;
652 	while (j != NULL) {
653 		i++;
654 		j = j->next;
655 	}
656 	mutex_exit(&jpool->mtx);
657 	return (i);
658 }
659 
660 static int
661 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
662 {
663 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
664 	int allocated = 0;
665 	int err;
666 	int needed;
667 
668 	/*
669 	 * if total is set, user wants "num" jbufs in the pool,
670 	 * otherwise the user wants to "num" additional jbufs
671 	 * added to the pool
672 	 */
673 	if (total && jpool->num_alloc) {
674 		allocated = myri10ge_jfree_cnt(jpool);
675 		needed = num - allocated;
676 	} else {
677 		needed = num;
678 	}
679 
680 	while (needed > 0) {
681 		needed--;
682 		err = myri10ge_add_jbuf(ss);
683 		if (err == 0) {
684 			allocated++;
685 		}
686 	}
687 	return (allocated);
688 }
689 
690 static void
691 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
692 {
693 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
694 	struct myri10ge_jpool_entry *j;
695 
696 	mutex_enter(&jpool->mtx);
697 	myri10ge_pull_jpool(ss);
698 	while (jpool->head != NULL) {
699 		jpool->num_alloc--;
700 		j = jpool->head;
701 		jpool->head = j->next;
702 		myri10ge_remove_jbuf(j);
703 	}
704 	mutex_exit(&jpool->mtx);
705 }
706 
707 static void
708 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
709 {
710 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
711 	struct myri10ge_jpool_entry *j = NULL;
712 	caddr_t ptr;
713 	uint32_t dma_low, dma_high;
714 	int idx, len;
715 	unsigned int alloc_size;
716 
717 	dma_low = dma_high = len = 0;
718 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
719 	ptr = NULL;
720 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
721 		/* Allocate a jumbo frame and carve it into small frames */
722 		if (len < alloc_size) {
723 			mutex_enter(&jpool->mtx);
724 			/* remove jumbo from freelist */
725 			j = jpool->head;
726 			jpool->head = j->next;
727 			/* place it onto small list */
728 			j->next = ss->small_jpool;
729 			ss->small_jpool = j;
730 			mutex_exit(&jpool->mtx);
731 			len = myri10ge_mtu;
732 			dma_low = ntohl(j->dma.low);
733 			dma_high = ntohl(j->dma.high);
734 			ptr = j->buf;
735 		}
736 		ss->rx_small.info[idx].ptr = ptr;
737 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
738 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
739 		len -= alloc_size;
740 		ptr += alloc_size;
741 		dma_low += alloc_size;
742 	}
743 }
744 
745 /*
746  * Return the jumbo bufs we carved up for small to the jumbo pool
747  */
748 
749 static void
750 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
751 {
752 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
753 	struct myri10ge_jpool_entry *j = NULL;
754 
755 	mutex_enter(&jpool->mtx);
756 	while (ss->small_jpool != NULL) {
757 		j = ss->small_jpool;
758 		ss->small_jpool = j->next;
759 		j->next = jpool->head;
760 		jpool->head = j;
761 	}
762 	mutex_exit(&jpool->mtx);
763 	ss->jbufs_for_smalls = 0;
764 }
765 
766 static int
767 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
768 {
769 	myri10ge_tx_ring_t *tx = &ss->tx;
770 	struct myri10ge_priv *mgp = ss->mgp;
771 	struct myri10ge_tx_dma_handle *handle;
772 	int err;
773 
774 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
775 	err = ddi_dma_alloc_handle(mgp->dip,
776 	    &myri10ge_tx_dma_attr,
777 	    DDI_DMA_SLEEP, NULL,
778 	    &handle->h);
779 	if (err) {
780 		static int limit = 0;
781 		if (limit == 0)
782 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
783 			    mgp->name);
784 		limit++;
785 		kmem_free(handle, sizeof (*handle));
786 		return (err);
787 	}
788 	mutex_enter(&tx->handle_lock);
789 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
790 	handle->next = tx->free_tx_handles;
791 	tx->free_tx_handles = handle;
792 	mutex_exit(&tx->handle_lock);
793 	return (DDI_SUCCESS);
794 }
795 
796 static void
797 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
798 {
799 	myri10ge_tx_ring_t *tx = &ss->tx;
800 	struct myri10ge_tx_dma_handle *handle;
801 	mutex_enter(&tx->handle_lock);
802 
803 	handle = tx->free_tx_handles;
804 	while (handle != NULL) {
805 		tx->free_tx_handles = handle->next;
806 		ddi_dma_free_handle(&handle->h);
807 		kmem_free(handle, sizeof (*handle));
808 		handle = tx->free_tx_handles;
809 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
810 	}
811 	mutex_exit(&tx->handle_lock);
812 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
813 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
814 		    ss->mgp->name,
815 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
816 	}
817 }
818 
819 static void
820 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
821     struct myri10ge_tx_dma_handle_head *list)
822 {
823 	mutex_enter(&tx->handle_lock);
824 	list->tail->next = tx->free_tx_handles;
825 	tx->free_tx_handles = list->head;
826 	mutex_exit(&tx->handle_lock);
827 }
828 
829 static void
830 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
831     struct myri10ge_tx_dma_handle *handle)
832 {
833 	struct myri10ge_tx_dma_handle_head list;
834 
835 	if (handle == NULL)
836 		return;
837 	list.head = handle;
838 	list.tail = handle;
839 	while (handle != NULL) {
840 		list.tail = handle;
841 		handle = handle->next;
842 	}
843 	myri10ge_free_tx_handles(tx, &list);
844 }
845 
846 static int
847 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
848     struct myri10ge_tx_dma_handle **ret)
849 {
850 	myri10ge_tx_ring_t *tx = &ss->tx;
851 	struct myri10ge_tx_dma_handle *handle;
852 	int err, i;
853 
854 	mutex_enter(&tx->handle_lock);
855 	for (i = 0; i < count; i++) {
856 		handle = tx->free_tx_handles;
857 		while (handle == NULL) {
858 			mutex_exit(&tx->handle_lock);
859 			err = myri10ge_add_tx_handle(ss);
860 			if (err != DDI_SUCCESS) {
861 				goto abort_with_handles;
862 			}
863 			mutex_enter(&tx->handle_lock);
864 			handle = tx->free_tx_handles;
865 		}
866 		tx->free_tx_handles = handle->next;
867 		handle->next = *ret;
868 		*ret = handle;
869 	}
870 	mutex_exit(&tx->handle_lock);
871 	return (DDI_SUCCESS);
872 
873 abort_with_handles:
874 	myri10ge_free_tx_handle_slist(tx, *ret);
875 	return (err);
876 }
877 
878 
879 /*
880  * Frees DMA resources associated with the send ring
881  */
882 static void
883 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
884 {
885 	myri10ge_tx_ring_t *tx;
886 	struct myri10ge_tx_dma_handle_head handles;
887 	size_t bytes;
888 	int idx;
889 
890 	tx = &ss->tx;
891 	handles.head = NULL;
892 	handles.tail = NULL;
893 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
894 		if (tx->info[idx].m) {
895 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
896 			handles.head = tx->info[idx].handle;
897 			if (handles.tail == NULL)
898 				handles.tail = tx->info[idx].handle;
899 			freeb(tx->info[idx].m);
900 			tx->info[idx].m = 0;
901 			tx->info[idx].handle = 0;
902 		}
903 		tx->cp[idx].va = NULL;
904 		myri10ge_dma_free(&tx->cp[idx].dma);
905 	}
906 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
907 	kmem_free(tx->cp, bytes);
908 	tx->cp = NULL;
909 	if (handles.head != NULL)
910 		myri10ge_free_tx_handles(tx, &handles);
911 	myri10ge_remove_tx_handles(ss);
912 }
913 
914 /*
915  * Allocates DMA handles associated with the send ring
916  */
917 static inline int
918 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
919 {
920 	struct myri10ge_tx_dma_handle *handles;
921 	int h;
922 	size_t bytes;
923 
924 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
925 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
926 	if (ss->tx.cp == NULL) {
927 		cmn_err(CE_WARN,
928 		    "%s: Failed to allocate tx copyblock storage\n",
929 		    ss->mgp->name);
930 		return (DDI_FAILURE);
931 	}
932 
933 
934 	/* allocate the TX copyblocks */
935 	for (h = 0; h < ss->tx.mask + 1; h++) {
936 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
937 		    4096, &myri10ge_rx_jumbo_dma_attr,
938 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
939 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
940 		    DDI_DMA_DONTWAIT);
941 		if (ss->tx.cp[h].va == NULL) {
942 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
943 			    "copyblock %d\n", ss->mgp->name, h);
944 			goto abort_with_copyblocks;
945 		}
946 	}
947 	/* pre-allocate transmit handles */
948 	handles = NULL;
949 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
950 	    &handles);
951 	if (handles != NULL)
952 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
953 
954 	return (DDI_SUCCESS);
955 
956 abort_with_copyblocks:
957 	while (h > 0)  {
958 		h--;
959 		myri10ge_dma_free(&ss->tx.cp[h].dma);
960 	}
961 
962 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
963 	kmem_free(ss->tx.cp, bytes);
964 	ss->tx.cp = NULL;
965 	return (DDI_FAILURE);
966 }
967 
968 /*
969  * The eeprom strings on the lanaiX have the format
970  * SN=x\0
971  * MAC=x:x:x:x:x:x\0
972  * PT:ddd mmm xx xx:xx:xx xx\0
973  * PV:ddd mmm xx xx:xx:xx xx\0
974  */
975 static int
976 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
977 {
978 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
979 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
980 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
981 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
982 
983 	char *ptr, *limit;
984 	int i, hv, lv;
985 
986 	ptr = mgp->eeprom_strings;
987 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
988 
989 	while (*ptr != '\0' && ptr < limit) {
990 		if (memcmp(ptr, "MAC=", 4) == 0) {
991 			ptr += 4;
992 			if (myri10ge_verbose)
993 				printf("%s: mac address = %s\n", mgp->name,
994 				    ptr);
995 			mgp->mac_addr_string = ptr;
996 			for (i = 0; i < 6; i++) {
997 				if ((ptr + 2) > limit)
998 					goto abort;
999 
1000 				if (*(ptr+1) == ':') {
1001 					hv = 0;
1002 					lv = myri10ge_digit(*ptr); ptr++;
1003 				} else {
1004 					hv = myri10ge_digit(*ptr); ptr++;
1005 					lv = myri10ge_digit(*ptr); ptr++;
1006 				}
1007 				mgp->mac_addr[i] = (hv << 4) | lv;
1008 				ptr++;
1009 			}
1010 		}
1011 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1012 			ptr += 3;
1013 			mgp->sn_str = (char *)ptr;
1014 		}
1015 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1016 			ptr += 3;
1017 			mgp->pc_str = (char *)ptr;
1018 		}
1019 		MYRI10GE_NEXT_STRING(ptr);
1020 	}
1021 
1022 	return (0);
1023 
1024 abort:
1025 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1026 	return (ENXIO);
1027 }
1028 
1029 
1030 /*
1031  * Determine the register set containing the PCI resource we
1032  * want to map: the memory-mappable part of the interface. We do
1033  * this by scanning the DDI "reg" property of the interface,
1034  * which is an array of mx_ddi_reg_set structures.
1035  */
1036 static int
1037 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1038     unsigned long *busno, unsigned long *devno,
1039     unsigned long *funcno)
1040 {
1041 
1042 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1043 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1044 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1045 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1046 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1047 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1048 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1049 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1050 #define	PCI_SPAN_LOW(ip)	(ip[4])
1051 
1052 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1053 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1054 
1055 	int *data, i, *rs;
1056 	uint32_t nelementsp;
1057 
1058 #ifdef MYRI10GE_REGSET_VERBOSE
1059 	char *address_space_name[] = { "Configuration Space",
1060 					"I/O Space",
1061 					"32-bit Memory Space",
1062 					"64-bit Memory Space"
1063 	};
1064 #endif
1065 
1066 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1067 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1068 		printf("Could not determine register set.\n");
1069 		return (ENXIO);
1070 	}
1071 
1072 #ifdef MYRI10GE_REGSET_VERBOSE
1073 	printf("There are %d register sets.\n", nelementsp / 5);
1074 #endif
1075 	if (!nelementsp) {
1076 		printf("Didn't find any \"reg\" properties.\n");
1077 		ddi_prop_free(data);
1078 		return (ENODEV);
1079 	}
1080 
1081 	/* Scan for the register number. */
1082 	rs = &data[0];
1083 	*busno = BUS_NUMBER(rs);
1084 	*devno = DEVICE_NUMBER(rs);
1085 	*funcno = FUNCTION_NUMBER(rs);
1086 
1087 #ifdef MYRI10GE_REGSET_VERBOSE
1088 	printf("*** Scanning for register number.\n");
1089 #endif
1090 	for (i = 0; i < nelementsp / 5; i++) {
1091 		rs = &data[5 * i];
1092 #ifdef MYRI10GE_REGSET_VERBOSE
1093 		printf("Examining register set %d:\n", i);
1094 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1095 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1096 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1097 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1098 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1099 		    address_space_name[ADDRESS_SPACE(rs)]);
1100 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1101 		    PCI_ADDR_LOW(rs));
1102 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1103 		    PCI_SPAN_LOW(rs));
1104 #endif
1105 		/* We are looking for a memory property. */
1106 
1107 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1108 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1109 			*reg_set = i;
1110 
1111 #ifdef MYRI10GE_REGSET_VERBOSE
1112 			printf("%s uses register set %d.\n",
1113 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1114 #endif
1115 
1116 			*span = (PCI_SPAN_LOW(rs));
1117 #ifdef MYRI10GE_REGSET_VERBOSE
1118 			printf("Board span is 0x%x\n", *span);
1119 #endif
1120 			break;
1121 		}
1122 	}
1123 
1124 	ddi_prop_free(data);
1125 
1126 	/* If no match, fail. */
1127 	if (i >= nelementsp / 5) {
1128 		return (EIO);
1129 	}
1130 
1131 	return (0);
1132 }
1133 
1134 
1135 static int
1136 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1137 {
1138 	void *inflate_buffer;
1139 	int rv, status;
1140 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1141 	size_t destlen;
1142 	mcp_gen_header_t *hdr;
1143 	unsigned hdr_offset, i;
1144 
1145 
1146 	*limit = 0; /* -Wuninitialized */
1147 	status = 0;
1148 
1149 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1150 	if (!inflate_buffer) {
1151 		cmn_err(CE_WARN,
1152 		    "%s: Could not allocate buffer to inflate mcp\n",
1153 		    mgp->name);
1154 		return (ENOMEM);
1155 	}
1156 
1157 	destlen = sram_size;
1158 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1159 	    mgp->eth_z8e_length);
1160 
1161 	if (rv != Z_OK) {
1162 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1163 		    mgp->name, z_strerror(rv));
1164 		status = ENXIO;
1165 		goto abort;
1166 	}
1167 
1168 	*limit = (uint32_t)destlen;
1169 
1170 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1171 	    MCP_HEADER_PTR_OFFSET));
1172 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1173 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1174 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1175 		    ntohl(hdr->mcp_type));
1176 		status = EIO;
1177 		goto abort;
1178 	}
1179 
1180 	/* save firmware version for kstat */
1181 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1182 	if (myri10ge_verbose)
1183 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1184 
1185 	/* Copy the inflated firmware to NIC SRAM. */
1186 	for (i = 0; i < *limit; i += 256) {
1187 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1188 		    (char *)inflate_buffer + i,
1189 		    min(256U, (unsigned)(*limit - i)));
1190 		mb();
1191 		(void) *(int *)(void *)mgp->sram;
1192 		mb();
1193 	}
1194 
1195 abort:
1196 	kmem_free(inflate_buffer, sram_size);
1197 
1198 	return (status);
1199 
1200 }
1201 
1202 
1203 int
1204 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1205 		myri10ge_cmd_t *data)
1206 {
1207 	mcp_cmd_t *buf;
1208 	char buf_bytes[sizeof (*buf) + 8];
1209 	volatile mcp_cmd_response_t *response = mgp->cmd;
1210 	volatile char *cmd_addr =
1211 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1212 	int sleep_total = 0;
1213 
1214 	/* ensure buf is aligned to 8 bytes */
1215 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1216 
1217 	buf->data0 = htonl(data->data0);
1218 	buf->data1 = htonl(data->data1);
1219 	buf->data2 = htonl(data->data2);
1220 	buf->cmd = htonl(cmd);
1221 	buf->response_addr.low = mgp->cmd_dma.low;
1222 	buf->response_addr.high = mgp->cmd_dma.high;
1223 	mutex_enter(&mgp->cmd_lock);
1224 	response->result = 0xffffffff;
1225 	mb();
1226 
1227 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1228 
1229 	/* wait up to 20ms */
1230 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1231 		mb();
1232 		if (response->result != 0xffffffff) {
1233 			if (response->result == 0) {
1234 				data->data0 = ntohl(response->data);
1235 				mutex_exit(&mgp->cmd_lock);
1236 				return (0);
1237 			} else if (ntohl(response->result)
1238 			    == MXGEFW_CMD_UNKNOWN) {
1239 				mutex_exit(&mgp->cmd_lock);
1240 				return (ENOSYS);
1241 			} else if (ntohl(response->result)
1242 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1243 				mutex_exit(&mgp->cmd_lock);
1244 				return (E2BIG);
1245 			} else {
1246 				cmn_err(CE_WARN,
1247 				    "%s: command %d failed, result = %d\n",
1248 				    mgp->name, cmd, ntohl(response->result));
1249 				mutex_exit(&mgp->cmd_lock);
1250 				return (ENXIO);
1251 			}
1252 		}
1253 		drv_usecwait(1000);
1254 	}
1255 	mutex_exit(&mgp->cmd_lock);
1256 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1257 	    mgp->name, cmd, ntohl(response->result));
1258 	return (EAGAIN);
1259 }
1260 
1261 /*
1262  * Enable or disable periodic RDMAs from the host to make certain
1263  * chipsets resend dropped PCIe messages
1264  */
1265 
1266 static void
1267 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1268 {
1269 	char buf_bytes[72];
1270 	volatile uint32_t *confirm;
1271 	volatile char *submit;
1272 	uint32_t *buf;
1273 	int i;
1274 
1275 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1276 
1277 	/* clear confirmation addr */
1278 	confirm = (volatile uint32_t *)mgp->cmd;
1279 	*confirm = 0;
1280 	mb();
1281 
1282 	/*
1283 	 * send an rdma command to the PCIe engine, and wait for the
1284 	 * response in the confirmation address.  The firmware should
1285 	 *  write a -1 there to indicate it is alive and well
1286 	 */
1287 
1288 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1289 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1290 	buf[2] = htonl(0xffffffff);		/* confirm data */
1291 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1292 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1293 	buf[5] = htonl(enable);			/* enable? */
1294 
1295 
1296 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1297 
1298 	myri10ge_pio_copy((char *)submit, buf, 64);
1299 	mb();
1300 	drv_usecwait(1000);
1301 	mb();
1302 	i = 0;
1303 	while (*confirm != 0xffffffff && i < 20) {
1304 		drv_usecwait(1000);
1305 		i++;
1306 	}
1307 	if (*confirm != 0xffffffff) {
1308 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1309 		    mgp->name,
1310 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1311 	}
1312 }
1313 
1314 static int
1315 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1316 {
1317 	myri10ge_cmd_t cmd;
1318 	volatile uint32_t *confirm;
1319 	volatile char *submit;
1320 	char buf_bytes[72];
1321 	uint32_t *buf, size;
1322 	int status, i;
1323 
1324 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1325 
1326 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1327 	if (status) {
1328 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1329 		return (status);
1330 	}
1331 
1332 	/* clear confirmation addr */
1333 	confirm = (volatile uint32_t *)mgp->cmd;
1334 	*confirm = 0;
1335 	mb();
1336 
1337 	/*
1338 	 * send a reload command to the bootstrap MCP, and wait for the
1339 	 * response in the confirmation address.  The firmware should
1340 	 * write a -1 there to indicate it is alive and well
1341 	 */
1342 
1343 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1344 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1345 	buf[2] = htonl(0xffffffff);	/* confirm data */
1346 
1347 	/*
1348 	 * FIX: All newest firmware should un-protect the bottom of
1349 	 * the sram before handoff. However, the very first interfaces
1350 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1351 	 */
1352 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1353 	buf[4] = htonl(size - 8); 	/* length of code */
1354 	buf[5] = htonl(8);		/* where to copy to */
1355 	buf[6] = htonl(0);		/* where to jump to */
1356 
1357 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1358 
1359 	myri10ge_pio_copy((char *)submit, buf, 64);
1360 	mb();
1361 	drv_usecwait(1000);
1362 	mb();
1363 	i = 0;
1364 	while (*confirm != 0xffffffff && i < 1000) {
1365 		drv_usecwait(1000);
1366 		i++;
1367 	}
1368 	if (*confirm != 0xffffffff) {
1369 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1370 		    mgp->name, (void *) confirm, *confirm);
1371 
1372 		return (ENXIO);
1373 	}
1374 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1375 	if (status != 0) {
1376 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1377 		    mgp->name);
1378 		return (ENXIO);
1379 	}
1380 
1381 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1382 	myri10ge_dummy_rdma(mgp, 1);
1383 	return (0);
1384 }
1385 
1386 static int
1387 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1388 {
1389 	struct myri10ge_priv *mgp = arg;
1390 	myri10ge_cmd_t cmd;
1391 	int status;
1392 
1393 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1394 	    | (addr[2] << 8) | addr[3]);
1395 
1396 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1397 
1398 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1399 	if (status == 0 && (addr != mgp->mac_addr))
1400 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1401 
1402 	return (status);
1403 }
1404 
1405 static int
1406 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1407 {
1408 	myri10ge_cmd_t cmd;
1409 	int status;
1410 
1411 	if (pause)
1412 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1413 		    &cmd);
1414 	else
1415 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1416 		    &cmd);
1417 
1418 	if (status) {
1419 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1420 		    mgp->name);
1421 		return (ENXIO);
1422 	}
1423 	mgp->pause = pause;
1424 	return (0);
1425 }
1426 
1427 static void
1428 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1429 {
1430 	myri10ge_cmd_t cmd;
1431 	int status;
1432 
1433 	if (promisc)
1434 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1435 	else
1436 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1437 
1438 	if (status) {
1439 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1440 		    mgp->name);
1441 	}
1442 }
1443 
1444 static int
1445 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1446 {
1447 	myri10ge_cmd_t cmd;
1448 	int status;
1449 	uint32_t len;
1450 	void *dmabench;
1451 	struct myri10ge_dma_stuff dmabench_dma;
1452 	char *test = " ";
1453 
1454 	/*
1455 	 * Run a small DMA test.
1456 	 * The magic multipliers to the length tell the firmware
1457 	 * tp do DMA read, write, or read+write tests.  The
1458 	 * results are returned in cmd.data0.  The upper 16
1459 	 * bits or the return is the number of transfers completed.
1460 	 * The lower 16 bits is the time in 0.5us ticks that the
1461 	 * transfers took to complete
1462 	 */
1463 
1464 	len = mgp->tx_boundary;
1465 
1466 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1467 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1468 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1469 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1470 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1471 	if (dmabench == NULL) {
1472 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1473 		return (ENOMEM);
1474 	}
1475 
1476 	cmd.data0 = ntohl(dmabench_dma.low);
1477 	cmd.data1 = ntohl(dmabench_dma.high);
1478 	cmd.data2 = len * 0x10000;
1479 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1480 	if (status != 0) {
1481 		test = "read";
1482 		goto abort;
1483 	}
1484 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1485 
1486 	cmd.data0 = ntohl(dmabench_dma.low);
1487 	cmd.data1 = ntohl(dmabench_dma.high);
1488 	cmd.data2 = len * 0x1;
1489 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1490 	if (status != 0) {
1491 		test = "write";
1492 		goto abort;
1493 	}
1494 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1495 
1496 	cmd.data0 = ntohl(dmabench_dma.low);
1497 	cmd.data1 = ntohl(dmabench_dma.high);
1498 	cmd.data2 = len * 0x10001;
1499 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1500 	if (status != 0) {
1501 		test = "read/write";
1502 		goto abort;
1503 	}
1504 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1505 	    (cmd.data0 & 0xffff);
1506 
1507 
1508 abort:
1509 	myri10ge_dma_free(&dmabench_dma);
1510 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1511 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1512 		    test);
1513 	return (status);
1514 }
1515 
1516 static int
1517 myri10ge_reset(struct myri10ge_priv *mgp)
1518 {
1519 	myri10ge_cmd_t cmd;
1520 	struct myri10ge_nic_stat *ethstat;
1521 	struct myri10ge_slice_state *ss;
1522 	int i, status;
1523 	size_t bytes;
1524 
1525 	/* send a reset command to the card to see if it is alive */
1526 	(void) memset(&cmd, 0, sizeof (cmd));
1527 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1528 	if (status != 0) {
1529 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1530 		return (ENXIO);
1531 	}
1532 
1533 	/* Now exchange information about interrupts  */
1534 
1535 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1536 	cmd.data0 = (uint32_t)bytes;
1537 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1538 
1539 	/*
1540 	 * Even though we already know how many slices are supported
1541 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1542 	 * has magic side effects, and must be called after a reset.
1543 	 * It must be called prior to calling any RSS related cmds,
1544 	 * including assigning an interrupt queue for anything but
1545 	 * slice 0.  It must also be called *after*
1546 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1547 	 * the firmware to compute offsets.
1548 	 */
1549 
1550 	if (mgp->num_slices > 1) {
1551 
1552 		/* ask the maximum number of slices it supports */
1553 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1554 		    &cmd);
1555 		if (status != 0) {
1556 			cmn_err(CE_WARN,
1557 			    "%s: failed to get number of slices\n",
1558 			    mgp->name);
1559 			return (status);
1560 		}
1561 
1562 		/*
1563 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1564 		 * to setting up the interrupt queue DMA
1565 		 */
1566 
1567 		cmd.data0 = mgp->num_slices;
1568 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1569 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1570 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1571 		    &cmd);
1572 		if (status != 0) {
1573 			cmn_err(CE_WARN,
1574 			    "%s: failed to set number of slices\n",
1575 			    mgp->name);
1576 			return (status);
1577 		}
1578 	}
1579 	for (i = 0; i < mgp->num_slices; i++) {
1580 		ss = &mgp->ss[i];
1581 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1582 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1583 		cmd.data2 = i;
1584 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1585 		    &cmd);
1586 	};
1587 
1588 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1589 	for (i = 0; i < mgp->num_slices; i++) {
1590 		ss = &mgp->ss[i];
1591 		ss->irq_claim = (volatile unsigned int *)
1592 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1593 	}
1594 
1595 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1596 		status |= myri10ge_send_cmd(mgp,
1597 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1598 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1599 	}
1600 
1601 	status |= myri10ge_send_cmd(mgp,
1602 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1603 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1604 
1605 	if (status != 0) {
1606 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1607 		    mgp->name);
1608 		return (status);
1609 	}
1610 
1611 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1612 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1613 
1614 	/* reset mcp/driver shared state back to 0 */
1615 
1616 	for (i = 0; i < mgp->num_slices; i++) {
1617 		ss = &mgp->ss[i];
1618 		bytes = mgp->max_intr_slots *
1619 		    sizeof (*mgp->ss[0].rx_done.entry);
1620 		(void) memset(ss->rx_done.entry, 0, bytes);
1621 		ss->tx.req = 0;
1622 		ss->tx.done = 0;
1623 		ss->tx.pkt_done = 0;
1624 		ss->rx_big.cnt = 0;
1625 		ss->rx_small.cnt = 0;
1626 		ss->rx_done.idx = 0;
1627 		ss->rx_done.cnt = 0;
1628 		ss->rx_token = 0;
1629 		ss->tx.watchdog_done = 0;
1630 		ss->tx.watchdog_req = 0;
1631 		ss->tx.active = 0;
1632 		ss->tx.activate = 0;
1633 	}
1634 	mgp->watchdog_rx_pause = 0;
1635 	if (mgp->ksp_stat != NULL) {
1636 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1637 		ethstat->link_changes.value.ul = 0;
1638 	}
1639 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1640 	myri10ge_change_promisc(mgp, 0);
1641 	(void) myri10ge_change_pause(mgp, mgp->pause);
1642 	return (status);
1643 }
1644 
1645 static int
1646 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1647 {
1648 	myri10ge_cmd_t cmd;
1649 	int i, b, s, t, j;
1650 	int status;
1651 	uint32_t k[8];
1652 	uint32_t tmp;
1653 	uint8_t *key;
1654 
1655 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1656 	    &cmd);
1657 	if (status != 0) {
1658 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1659 		    mgp->name);
1660 		return (EIO);
1661 	}
1662 	myri10ge_pio_copy32(mgp->rss_key,
1663 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1664 	    sizeof (mgp->rss_key));
1665 
1666 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1667 	    KM_SLEEP);
1668 	key = (uint8_t *)mgp->rss_key;
1669 	t = 0;
1670 	for (b = 0; b < 12; b++) {
1671 		for (s = 0; s < 8; s++) {
1672 			/* Bits: b*8+s, ..., b*8+s+31 */
1673 			k[s] = 0;
1674 			for (j = 0; j < 32; j++) {
1675 				int bit = b*8+s+j;
1676 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1677 				k[s] |= bit << (31 - j);
1678 			}
1679 		}
1680 
1681 		for (i = 0; i <= 0xff; i++) {
1682 			tmp = 0;
1683 			if (i & (1 << 7)) { tmp ^= k[0]; }
1684 			if (i & (1 << 6)) { tmp ^= k[1]; }
1685 			if (i & (1 << 5)) { tmp ^= k[2]; }
1686 			if (i & (1 << 4)) { tmp ^= k[3]; }
1687 			if (i & (1 << 3)) { tmp ^= k[4]; }
1688 			if (i & (1 << 2)) { tmp ^= k[5]; }
1689 			if (i & (1 << 1)) { tmp ^= k[6]; }
1690 			if (i & (1 << 0)) { tmp ^= k[7]; }
1691 			mgp->toeplitz_hash_table[t++] = tmp;
1692 		}
1693 	}
1694 	return (0);
1695 }
1696 
1697 static inline struct myri10ge_slice_state *
1698 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1699 {
1700 	struct tcphdr *hdr;
1701 	uint32_t saddr, daddr;
1702 	uint32_t hash, slice;
1703 	uint32_t *table = mgp->toeplitz_hash_table;
1704 	uint16_t src, dst;
1705 
1706 	/*
1707 	 * Note hashing order is reversed from how it is done
1708 	 * in the NIC, so as to generate the same hash value
1709 	 * for the connection to try to keep connections CPU local
1710 	 */
1711 
1712 	/* hash on IPv4 src/dst address */
1713 	saddr = ntohl(ip->ip_src.s_addr);
1714 	daddr = ntohl(ip->ip_dst.s_addr);
1715 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1716 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1717 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1718 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1719 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1720 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1721 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1722 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1723 	/* hash on TCP port, if required */
1724 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1725 	    ip->ip_p == IPPROTO_TCP) {
1726 		hdr = (struct tcphdr *)(void *)
1727 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1728 		src = ntohs(hdr->th_sport);
1729 		dst = ntohs(hdr->th_dport);
1730 
1731 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1732 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1733 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1734 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1735 	}
1736 	slice = (mgp->num_slices - 1) & hash;
1737 	return (&mgp->ss[slice]);
1738 
1739 }
1740 
1741 static inline struct myri10ge_slice_state *
1742 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1743 {
1744 	struct tcphdr *hdr;
1745 	uint32_t slice, hash_val;
1746 
1747 
1748 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1749 		return (&mgp->ss[0]);
1750 	}
1751 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1752 
1753 	/*
1754 	 * Use the second byte of the *destination* address for
1755 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1756 	 */
1757 	hash_val = ntohs(hdr->th_dport) & 0xff;
1758 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1759 		hash_val += ntohs(hdr->th_sport) & 0xff;
1760 
1761 	slice = (mgp->num_slices - 1) & hash_val;
1762 	return (&mgp->ss[slice]);
1763 }
1764 
1765 static inline struct myri10ge_slice_state *
1766 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1767 {
1768 	unsigned int slice = 0;
1769 	struct ether_header *eh;
1770 	struct ether_vlan_header *vh;
1771 	struct ip *ip;
1772 	int ehl, ihl;
1773 
1774 	if (mgp->num_slices == 1)
1775 		return (&mgp->ss[0]);
1776 
1777 	if (myri10ge_tx_hash == 0) {
1778 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1779 		return (&mgp->ss[slice]);
1780 	}
1781 
1782 	/*
1783 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1784 	 *  headers are in the 1st mblk.  Otherwise, punt
1785 	 */
1786 	ehl = sizeof (*eh);
1787 	ihl = sizeof (*ip);
1788 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1789 		return (&mgp->ss[0]);
1790 	eh = (struct ether_header *)(void *)mp->b_rptr;
1791 	ip = (struct ip *)(void *)(eh + 1);
1792 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1793 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1794 			return (&mgp->ss[0]);
1795 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1796 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1797 			return (&mgp->ss[0]);
1798 		ehl += 4;
1799 		ip = (struct ip *)(void *)(vh + 1);
1800 	}
1801 	ihl = ip->ip_hl << 2;
1802 	if (MBLKL(mp) <  (ehl + ihl + 8))
1803 		return (&mgp->ss[0]);
1804 	switch (myri10ge_rss_hash) {
1805 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1806 		/* fallthru */
1807 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1808 		/* fallthru */
1809 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1810 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1811 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1812 		/* fallthru */
1813 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1814 		return (myri10ge_simple_send_hash(mgp, ip));
1815 	default:
1816 		break;
1817 	}
1818 	return (&mgp->ss[0]);
1819 }
1820 
1821 static int
1822 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1823 {
1824 	struct myri10ge_priv *mgp = ss->mgp;
1825 	myri10ge_cmd_t cmd;
1826 	int tx_ring_size, rx_ring_size;
1827 	int tx_ring_entries, rx_ring_entries;
1828 	int slice, status;
1829 	int allocated, idx;
1830 	size_t bytes;
1831 
1832 	slice = ss - mgp->ss;
1833 	cmd.data0 = slice;
1834 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1835 	tx_ring_size = cmd.data0;
1836 	cmd.data0 = slice;
1837 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1838 	if (status != 0)
1839 		return (status);
1840 	rx_ring_size = cmd.data0;
1841 
1842 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1843 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1844 	ss->tx.mask = tx_ring_entries - 1;
1845 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1846 
1847 	/* get the lanai pointers to the send and receive rings */
1848 
1849 	cmd.data0 = slice;
1850 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1851 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1852 	if (mgp->num_slices > 1) {
1853 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1854 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1855 		    64 * slice;
1856 	} else {
1857 		ss->tx.go = NULL;
1858 		ss->tx.stop = NULL;
1859 	}
1860 
1861 	cmd.data0 = slice;
1862 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1863 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1864 	    (void *)(mgp->sram + cmd.data0);
1865 
1866 	cmd.data0 = slice;
1867 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1868 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1869 	    (mgp->sram + cmd.data0);
1870 
1871 	if (status != 0) {
1872 		cmn_err(CE_WARN,
1873 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1874 		return (status);
1875 	}
1876 
1877 	status = ENOMEM;
1878 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1879 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1880 	if (ss->rx_small.shadow == NULL)
1881 		goto abort;
1882 	(void) memset(ss->rx_small.shadow, 0, bytes);
1883 
1884 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1885 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1886 	if (ss->rx_big.shadow == NULL)
1887 		goto abort_with_rx_small_shadow;
1888 	(void) memset(ss->rx_big.shadow, 0, bytes);
1889 
1890 	/* allocate the host info rings */
1891 
1892 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1893 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1894 	if (ss->tx.info == NULL)
1895 		goto abort_with_rx_big_shadow;
1896 	(void) memset(ss->tx.info, 0, bytes);
1897 
1898 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1899 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1900 	if (ss->rx_small.info == NULL)
1901 		goto abort_with_tx_info;
1902 	(void) memset(ss->rx_small.info, 0, bytes);
1903 
1904 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1905 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1906 	if (ss->rx_big.info == NULL)
1907 		goto abort_with_rx_small_info;
1908 	(void) memset(ss->rx_big.info, 0, bytes);
1909 
1910 	ss->tx.stall = ss->tx.sched = 0;
1911 	ss->tx.stall_early = ss->tx.stall_late = 0;
1912 
1913 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1914 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1915 
1916 	allocated = myri10ge_add_jbufs(ss,
1917 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1918 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1919 		cmn_err(CE_WARN,
1920 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1921 		    mgp->name, allocated,
1922 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1923 		goto abort_with_jumbos;
1924 	}
1925 
1926 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1927 	ss->j_rx_cnt = 0;
1928 
1929 	mutex_enter(&ss->jpool.mtx);
1930 	if (allocated < rx_ring_entries)
1931 		ss->jpool.low_water = allocated / 4;
1932 	else
1933 		ss->jpool.low_water = rx_ring_entries / 2;
1934 
1935 	/*
1936 	 * invalidate the big receive ring in case we do not
1937 	 * allocate sufficient jumbos to fill it
1938 	 */
1939 	(void) memset(ss->rx_big.shadow, 1,
1940 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1941 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1942 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1943 		    &ss->rx_big.shadow[idx - 7]);
1944 		mb();
1945 	}
1946 
1947 
1948 	myri10ge_restock_jumbos(ss);
1949 
1950 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1951 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1952 		    &ss->rx_small.shadow[idx - 7]);
1953 		mb();
1954 	}
1955 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1956 
1957 	mutex_exit(&ss->jpool.mtx);
1958 
1959 	status = myri10ge_prepare_tx_ring(ss);
1960 
1961 	if (status != 0)
1962 		goto abort_with_small_jbufs;
1963 
1964 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1965 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1966 	cmd.data2 = sizeof (mcp_irq_data_t);
1967 	cmd.data2 |= (slice << 16);
1968 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1969 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1970 	if (status == ENOSYS) {
1971 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1972 		    offsetof(mcp_irq_data_t, send_done_count);
1973 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1974 		status = myri10ge_send_cmd(mgp,
1975 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1976 	}
1977 	if (status) {
1978 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1979 		goto abort_with_tx;
1980 	}
1981 
1982 	return (0);
1983 
1984 abort_with_tx:
1985 	myri10ge_unprepare_tx_ring(ss);
1986 
1987 abort_with_small_jbufs:
1988 	myri10ge_release_small_jbufs(ss);
1989 
1990 abort_with_jumbos:
1991 	if (allocated != 0) {
1992 		mutex_enter(&ss->jpool.mtx);
1993 		ss->jpool.low_water = 0;
1994 		mutex_exit(&ss->jpool.mtx);
1995 		myri10ge_unstock_jumbos(ss);
1996 		myri10ge_remove_jbufs(ss);
1997 	}
1998 
1999 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2000 	kmem_free(ss->rx_big.info, bytes);
2001 
2002 abort_with_rx_small_info:
2003 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2004 	kmem_free(ss->rx_small.info, bytes);
2005 
2006 abort_with_tx_info:
2007 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2008 	kmem_free(ss->tx.info, bytes);
2009 
2010 abort_with_rx_big_shadow:
2011 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2012 	kmem_free(ss->rx_big.shadow, bytes);
2013 
2014 abort_with_rx_small_shadow:
2015 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2016 	kmem_free(ss->rx_small.shadow, bytes);
2017 abort:
2018 	return (status);
2019 
2020 }
2021 
2022 static void
2023 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2024 {
2025 	int tx_ring_entries, rx_ring_entries;
2026 	size_t bytes;
2027 
2028 	/* ignore slices that have not been fully setup */
2029 	if (ss->tx.cp == NULL)
2030 		return;
2031 	/* Free the TX copy buffers */
2032 	myri10ge_unprepare_tx_ring(ss);
2033 
2034 	/* stop passing returned buffers to firmware */
2035 
2036 	mutex_enter(&ss->jpool.mtx);
2037 	ss->jpool.low_water = 0;
2038 	mutex_exit(&ss->jpool.mtx);
2039 	myri10ge_release_small_jbufs(ss);
2040 
2041 	/* Release the free jumbo frame pool */
2042 	myri10ge_unstock_jumbos(ss);
2043 	myri10ge_remove_jbufs(ss);
2044 
2045 	rx_ring_entries = ss->rx_big.mask + 1;
2046 	tx_ring_entries = ss->tx.mask + 1;
2047 
2048 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2049 	kmem_free(ss->rx_big.info, bytes);
2050 
2051 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2052 	kmem_free(ss->rx_small.info, bytes);
2053 
2054 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2055 	kmem_free(ss->tx.info, bytes);
2056 
2057 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2058 	kmem_free(ss->rx_big.shadow, bytes);
2059 
2060 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2061 	kmem_free(ss->rx_small.shadow, bytes);
2062 
2063 }
2064 static int
2065 myri10ge_start_locked(struct myri10ge_priv *mgp)
2066 {
2067 	myri10ge_cmd_t cmd;
2068 	int status, big_pow2, i;
2069 	volatile uint8_t *itable;
2070 
2071 	status = DDI_SUCCESS;
2072 	/* Allocate DMA resources and receive buffers */
2073 
2074 	status = myri10ge_reset(mgp);
2075 	if (status != 0) {
2076 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2077 		return (DDI_FAILURE);
2078 	}
2079 
2080 	if (mgp->num_slices > 1) {
2081 		cmd.data0 = mgp->num_slices;
2082 		cmd.data1 = 1; /* use MSI-X */
2083 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2084 		    &cmd);
2085 		if (status != 0) {
2086 			cmn_err(CE_WARN,
2087 			    "%s: failed to set number of slices\n",
2088 			    mgp->name);
2089 			goto abort_with_nothing;
2090 		}
2091 		/* setup the indirection table */
2092 		cmd.data0 = mgp->num_slices;
2093 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2094 		    &cmd);
2095 
2096 		status |= myri10ge_send_cmd(mgp,
2097 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2098 		if (status != 0) {
2099 			cmn_err(CE_WARN,
2100 			    "%s: failed to setup rss tables\n", mgp->name);
2101 		}
2102 
2103 		/* just enable an identity mapping */
2104 		itable = mgp->sram + cmd.data0;
2105 		for (i = 0; i < mgp->num_slices; i++)
2106 			itable[i] = (uint8_t)i;
2107 
2108 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2109 			status = myri10ge_init_toeplitz(mgp);
2110 			if (status != 0) {
2111 				cmn_err(CE_WARN, "%s: failed to setup "
2112 				    "toeplitz tx hash table", mgp->name);
2113 				goto abort_with_nothing;
2114 			}
2115 		}
2116 		cmd.data0 = 1;
2117 		cmd.data1 = myri10ge_rss_hash;
2118 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2119 		    &cmd);
2120 		if (status != 0) {
2121 			cmn_err(CE_WARN,
2122 			    "%s: failed to enable slices\n", mgp->name);
2123 			goto abort_with_toeplitz;
2124 		}
2125 	}
2126 
2127 	for (i = 0; i < mgp->num_slices; i++) {
2128 		status = myri10ge_setup_slice(&mgp->ss[i]);
2129 		if (status != 0)
2130 			goto abort_with_slices;
2131 	}
2132 
2133 	/*
2134 	 * Tell the MCP how many buffers he has, and to
2135 	 *  bring the ethernet interface up
2136 	 *
2137 	 * Firmware needs the big buff size as a power of 2.  Lie and
2138 	 * tell him the buffer is larger, because we only use 1
2139 	 * buffer/pkt, and the mtu will prevent overruns
2140 	 */
2141 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2142 	while ((big_pow2 & (big_pow2 - 1)) != 0)
2143 		big_pow2++;
2144 
2145 	/* now give firmware buffers sizes, and MTU */
2146 	cmd.data0 = myri10ge_mtu;
2147 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2148 	cmd.data0 = myri10ge_small_bytes;
2149 	status |=
2150 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2151 	cmd.data0 = big_pow2;
2152 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2153 	if (status) {
2154 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2155 		goto abort_with_slices;
2156 	}
2157 
2158 
2159 	cmd.data0 = 1;
2160 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2161 	if (status) {
2162 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2163 		    mgp->name, status);
2164 	} else {
2165 		mgp->features |= MYRI10GE_TSO;
2166 	}
2167 
2168 	mgp->link_state = -1;
2169 	mgp->rdma_tags_available = 15;
2170 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2171 	if (status) {
2172 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2173 		goto abort_with_slices;
2174 	}
2175 	mgp->running = MYRI10GE_ETH_RUNNING;
2176 	return (DDI_SUCCESS);
2177 
2178 abort_with_slices:
2179 	for (i = 0; i < mgp->num_slices; i++)
2180 		myri10ge_teardown_slice(&mgp->ss[i]);
2181 
2182 	mgp->running = MYRI10GE_ETH_STOPPED;
2183 
2184 abort_with_toeplitz:
2185 	if (mgp->toeplitz_hash_table != NULL) {
2186 		kmem_free(mgp->toeplitz_hash_table,
2187 		    sizeof (uint32_t) * 12 * 256);
2188 		mgp->toeplitz_hash_table = NULL;
2189 	}
2190 
2191 abort_with_nothing:
2192 	return (DDI_FAILURE);
2193 }
2194 
2195 static void
2196 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2197 {
2198 	int status, old_down_cnt;
2199 	myri10ge_cmd_t cmd;
2200 	int wait_time = 10;
2201 	int i, polling;
2202 
2203 	old_down_cnt = mgp->down_cnt;
2204 	mb();
2205 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2206 	if (status) {
2207 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2208 	}
2209 
2210 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2211 		delay(1 * drv_usectohz(1000000));
2212 		wait_time--;
2213 		if (wait_time == 0)
2214 			break;
2215 	}
2216 again:
2217 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2218 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2219 		for (i = 0; i < mgp->num_slices; i++) {
2220 			/*
2221 			 * take and release the rx lock to ensure
2222 			 * that no interrupt thread is blocked
2223 			 * elsewhere in the stack, preventing
2224 			 * completion
2225 			 */
2226 
2227 			mutex_enter(&mgp->ss[i].rx_lock);
2228 			printf("%s: slice %d rx irq idle\n",
2229 			    mgp->name, i);
2230 			mutex_exit(&mgp->ss[i].rx_lock);
2231 
2232 			/* verify that the poll handler is inactive */
2233 			mutex_enter(&mgp->ss->poll_lock);
2234 			polling = mgp->ss->rx_polling;
2235 			mutex_exit(&mgp->ss->poll_lock);
2236 			if (polling) {
2237 				printf("%s: slice %d is polling\n",
2238 				    mgp->name, i);
2239 				delay(1 * drv_usectohz(1000000));
2240 				goto again;
2241 			}
2242 		}
2243 		delay(1 * drv_usectohz(1000000));
2244 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2245 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2246 		}
2247 	}
2248 
2249 	for (i = 0; i < mgp->num_slices; i++)
2250 		myri10ge_teardown_slice(&mgp->ss[i]);
2251 
2252 	if (mgp->toeplitz_hash_table != NULL) {
2253 		kmem_free(mgp->toeplitz_hash_table,
2254 		    sizeof (uint32_t) * 12 * 256);
2255 		mgp->toeplitz_hash_table = NULL;
2256 	}
2257 	mgp->running = MYRI10GE_ETH_STOPPED;
2258 }
2259 
2260 static int
2261 myri10ge_m_start(void *arg)
2262 {
2263 	struct myri10ge_priv *mgp = arg;
2264 	int status;
2265 
2266 	mutex_enter(&mgp->intrlock);
2267 
2268 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2269 		mutex_exit(&mgp->intrlock);
2270 		return (DDI_FAILURE);
2271 	}
2272 	status = myri10ge_start_locked(mgp);
2273 	mutex_exit(&mgp->intrlock);
2274 
2275 	if (status != DDI_SUCCESS)
2276 		return (status);
2277 
2278 	/* start the watchdog timer */
2279 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2280 	    mgp->timer_ticks);
2281 	return (DDI_SUCCESS);
2282 
2283 }
2284 
2285 static void
2286 myri10ge_m_stop(void *arg)
2287 {
2288 	struct myri10ge_priv *mgp = arg;
2289 
2290 	mutex_enter(&mgp->intrlock);
2291 	/* if the device not running give up */
2292 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2293 		mutex_exit(&mgp->intrlock);
2294 		return;
2295 	}
2296 
2297 	mgp->running = MYRI10GE_ETH_STOPPING;
2298 	mutex_exit(&mgp->intrlock);
2299 	(void) untimeout(mgp->timer_id);
2300 	mutex_enter(&mgp->intrlock);
2301 	myri10ge_stop_locked(mgp);
2302 	mutex_exit(&mgp->intrlock);
2303 
2304 }
2305 
/*
 * Update the broadcast/multicast receive counters for a frame and,
 * for TCP or UDP carried over IPv4 or IPv6, attach the partial
 * checksum to the mblk with hcksum_assoc().
 *
 * mp	received frame; b_rptr is read as the start of the ethernet header
 * s	receive statistics whose brdcstrcv/multircv counters are bumped
 * csum	checksum value supplied by the caller for this frame
 *
 * Returns without attaching a checksum for any other protocol, and for
 * frames whose first mblk is longer than header + IP length (padding).
 */
static inline void
myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t start, stuff, end, partial, hdrlen;


	/* keep only the low 16 bits, passed through ntohs() */
	csum = ntohs((uint16_t)csum);
	eh = (struct ether_header *)(void *)mp->b_rptr;
	hdrlen = sizeof (*eh);
	/* low bit of the first destination octet: group address */
	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
			s->brdcstrcv++;
		else
			s->multircv++;
	}

	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
		/*
		 * fix checksum by subtracting 4 bytes after what the
		 * firmware thought was the end of the ether hdr
		 */
		partial = *(uint32_t *)
		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
		/* one's complement subtract, then fold carries to 16 bits */
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
		hdrlen += VLAN_TAGSZ;
	}

	/*
	 * NOTE(review): the tests below look at the same eh->ether_type
	 * that matched ETHERTYPE_VLAN above, so a tagged frame takes the
	 * final "else" and returns without a checksum being attached.
	 * Confirm whether the encapsulated type was meant to be used.
	 */
	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
		/* offsets are relative to the start of the IP header */
		start = ip->ip_hl << 2;

		if (ip->ip_p == IPPROTO_TCP)
			stuff = start + offsetof(struct tcphdr, th_sum);
		else if (ip->ip_p == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = ntohs(ip->ip_len);
	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
		/* only the fixed header is considered; ip6_nxt is not chased */
		start = sizeof (*ip6);
		if (ip6->ip6_nxt == IPPROTO_TCP) {
			stuff = start + offsetof(struct tcphdr, th_sum);
		} else if (ip6->ip6_nxt == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = start + ntohs(ip6->ip6_plen);
		/*
		 * IPv6 headers do not contain a checksum, and hence
		 * do not checksum to zero, so they don't "fall out"
		 * of the partial checksum calculation like IPv4
		 * headers do.  We need to fix the partial checksum by
		 * subtracting the checksum of the IPv6 header.
		 */

		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
	} else {
		return;
	}

	if (MBLKL(mp) > hdrlen + end) {
		/* padded frame, so hw csum may be invalid */
		return;
	}

	(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
	    csum, HCK_PARTIALCKSUM, 0);
}
2386 
2387 static mblk_t *
2388 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2389     uint32_t csum)
2390 {
2391 	mblk_t *mp;
2392 	myri10ge_rx_ring_t *rx;
2393 	int idx;
2394 
2395 	rx = &ss->rx_small;
2396 	idx = rx->cnt & rx->mask;
2397 	ss->rx_small.cnt++;
2398 
2399 	/* allocate a new buffer to pass up the stack */
2400 	mp = allocb(len + MXGEFW_PAD, 0);
2401 	if (mp == NULL) {
2402 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2403 		goto abort;
2404 	}
2405 	bcopy(ss->rx_small.info[idx].ptr,
2406 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2407 	mp->b_wptr += len + MXGEFW_PAD;
2408 	mp->b_rptr += MXGEFW_PAD;
2409 
2410 	ss->rx_stats.ibytes += len;
2411 	ss->rx_stats.ipackets += 1;
2412 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2413 
2414 abort:
2415 	if ((idx & 7) == 7) {
2416 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2417 		    &rx->shadow[idx - 7]);
2418 	}
2419 
2420 	return (mp);
2421 }
2422 
2423 
/*
 * Handle one completed receive on the big (jumbo) buffer ring.
 *
 * Normally the jumbo buffer itself is loaned to the stack through
 * desballoc().  When the number of buffers still posted to the NIC
 * stays below 16 even after restocking, the frame is instead copied
 * into a new mblk and the jumbo buffer is released right away.
 *
 * Returns the mblk to pass up, or NULL when the ring slot holds no
 * buffer or when no mblk could be allocated.
 */
static mblk_t *
myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
    uint32_t csum)
{
	struct myri10ge_jpool_stuff *jpool;
	struct myri10ge_jpool_entry *j;
	mblk_t *mp;
	int idx, num_owned_by_mcp;

	jpool = &ss->jpool;
	idx = ss->j_rx_cnt & ss->rx_big.mask;
	j = ss->rx_big.info[idx].j;

	/* nothing attached to this slot: report it and drop the frame */
	if (j == NULL) {
		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
		return (NULL);
	}


	/* take the buffer out of the ring and consume the slot */
	ss->rx_big.info[idx].j = NULL;
	ss->j_rx_cnt++;


	/*
	 * Check to see if we are low on rx buffers.
	 * Note that we must leave at least 8 free so there are
	 * enough to free in a single 64-byte write.
	 */
	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
	if (num_owned_by_mcp < jpool->low_water) {
		mutex_enter(&jpool->mtx);
		myri10ge_restock_jumbos(ss);
		mutex_exit(&jpool->mtx);
		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
		/* if we are still low, then we have to copy */
		if (num_owned_by_mcp < 16) {
			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
			/* allocate a new buffer to pass up the stack */
			mp = allocb(len + MXGEFW_PAD, 0);
			if (mp == NULL) {
				goto abort;
			}
			bcopy(j->buf,
			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
			/* data is copied, so the jumbo buffer can go back */
			myri10ge_jfree_rtn(j);
			/* push buffer back to NIC */
			mutex_enter(&jpool->mtx);
			myri10ge_restock_jumbos(ss);
			mutex_exit(&jpool->mtx);
			goto set_len;
		}
	}

	/* loan our buffer to the stack */
	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
	if (mp == NULL) {
		goto abort;
	}

set_len:
	/* skip the pad in front of the frame and set the data length */
	mp->b_rptr += MXGEFW_PAD;
	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);

	ss->rx_stats.ibytes += len;
	ss->rx_stats.ipackets += 1;
	myri10ge_rx_csum(mp, &ss->rx_stats, csum);

	return (mp);

abort:
	/* no mblk available: release the jumbo buffer and count the drop */
	myri10ge_jfree_rtn(j);
	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
	return (NULL);
}
2499 
2500 /*
2501  * Free all transmit buffers up until the specified index
2502  */
2503 static inline void
2504 myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
2505 {
2506 	myri10ge_tx_ring_t *tx;
2507 	struct myri10ge_tx_dma_handle_head handles;
2508 	int idx;
2509 	int limit = 0;
2510 
2511 	tx = &ss->tx;
2512 	handles.head = NULL;
2513 	handles.tail = NULL;
2514 	while (tx->pkt_done != (int)mcp_index) {
2515 		idx = tx->done & tx->mask;
2516 
2517 		/*
2518 		 * mblk & DMA handle attached only to first slot
2519 		 * per buffer in the packet
2520 		 */
2521 
2522 		if (tx->info[idx].m) {
2523 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
2524 			tx->info[idx].handle->next = handles.head;
2525 			handles.head = tx->info[idx].handle;
2526 			if (handles.tail == NULL)
2527 				handles.tail = tx->info[idx].handle;
2528 			freeb(tx->info[idx].m);
2529 			tx->info[idx].m = 0;
2530 			tx->info[idx].handle = 0;
2531 		}
2532 		if (tx->info[idx].ostat.opackets != 0) {
2533 			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
2534 			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
2535 			tx->stats.obytes += tx->info[idx].ostat.obytes;
2536 			tx->stats.opackets += tx->info[idx].ostat.opackets;
2537 			tx->info[idx].stat.un.all = 0;
2538 			tx->pkt_done++;
2539 		}
2540 
2541 		tx->done++;
2542 		/*
2543 		 * if we stalled the queue, wake it.  But Wait until
2544 		 * we have at least 1/2 our slots free.
2545 		 */
2546 		if ((tx->req - tx->done) < (tx->mask >> 1) &&
2547 		    tx->stall != tx->sched) {
2548 			mutex_enter(&ss->tx.lock);
2549 			tx->sched = tx->stall;
2550 			mutex_exit(&ss->tx.lock);
2551 			mac_tx_ring_update(ss->mgp->mh, tx->rh);
2552 		}
2553 
2554 		/* limit potential for livelock */
2555 		if (unlikely(++limit >  2 * tx->mask))
2556 			break;
2557 	}
2558 	if (tx->req == tx->done && tx->stop != NULL) {
2559 		/*
2560 		 * Nic has sent all pending requests, allow him
2561 		 * to stop polling this queue
2562 		 */
2563 		mutex_enter(&tx->lock);
2564 		if (tx->req == tx->done && tx->active) {
2565 			*(int *)(void *)tx->stop = 1;
2566 			tx->active = 0;
2567 			mb();
2568 		}
2569 		mutex_exit(&tx->lock);
2570 	}
2571 	if (handles.head != NULL)
2572 		myri10ge_free_tx_handles(tx, &handles);
2573 }
2574 
2575 static void
2576 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2577 {
2578 	mbl->head = NULL;
2579 	mbl->tail = &mbl->head;
2580 	mbl->cnt = 0;
2581 }
2582 
2583 /*ARGSUSED*/
2584 void
2585 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2586     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2587 {
2588 	*(mbl->tail) = mp;
2589 	mbl->tail = &mp->b_next;
2590 	mp->b_next = NULL;
2591 	mbl->cnt++;
2592 }
2593 
2594 
2595 static inline void
2596 myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
2597     struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
2598 {
2599 	myri10ge_rx_done_t *rx_done = &ss->rx_done;
2600 	struct myri10ge_priv *mgp = ss->mgp;
2601 	mblk_t *mp;
2602 	struct lro_entry *lro;
2603 	uint16_t length;
2604 	uint16_t checksum;
2605 
2606 
2607 	while (rx_done->entry[rx_done->idx].length != 0) {
2608 		if (unlikely (*stop)) {
2609 			break;
2610 		}
2611 		length = ntohs(rx_done->entry[rx_done->idx].length);
2612 		length &= (~MXGEFW_RSS_HASH_MASK);
2613 
2614 		/* limit potential for livelock */
2615 		limit -= length;
2616 		if (unlikely(limit < 0))
2617 			break;
2618 
2619 		rx_done->entry[rx_done->idx].length = 0;
2620 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2621 		if (length <= myri10ge_small_bytes)
2622 			mp = myri10ge_rx_done_small(ss, length, checksum);
2623 		else
2624 			mp = myri10ge_rx_done_big(ss, length, checksum);
2625 		if (mp != NULL) {
2626 			if (!myri10ge_lro ||
2627 			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
2628 				myri10ge_mbl_append(ss, mbl, mp);
2629 		}
2630 		rx_done->cnt++;
2631 		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
2632 	}
2633 	while (ss->lro_active != NULL) {
2634 		lro = ss->lro_active;
2635 		ss->lro_active = lro->next;
2636 		myri10ge_lro_flush(ss, lro, mbl);
2637 	}
2638 }
2639 
2640 static void
2641 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2642 {
2643 	uint64_t gen;
2644 	struct myri10ge_mblk_list mbl;
2645 
2646 	myri10ge_mbl_init(&mbl);
2647 	if (mutex_tryenter(&ss->rx_lock) == 0)
2648 		return;
2649 	gen = ss->rx_gen_num;
2650 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2651 	    &ss->rx_polling);
2652 	if (mbl.head != NULL)
2653 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2654 	mutex_exit(&ss->rx_lock);
2655 
2656 }
2657 
2658 static mblk_t *
2659 myri10ge_poll_rx(void *arg, int bytes)
2660 {
2661 	struct myri10ge_slice_state *ss = arg;
2662 	struct myri10ge_mblk_list mbl;
2663 	boolean_t dummy = B_FALSE;
2664 
2665 	if (bytes == 0)
2666 		return (NULL);
2667 
2668 	myri10ge_mbl_init(&mbl);
2669 	mutex_enter(&ss->rx_lock);
2670 	if (ss->rx_polling)
2671 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2672 	else
2673 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2674 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2675 	mutex_exit(&ss->rx_lock);
2676 	return (mbl.head);
2677 }
2678 
/*
 * Interrupt handler for one slice; arg0 is the slice state.
 *
 * Uses the firmware-written status block (ss->fw_stats) to process
 * receives and transmit completions, track link state changes and
 * RDMA tag timeouts, and finally writes the irq_claim words.
 *
 * Returns DDI_INTR_UNCLAIMED when the status block is not marked
 * valid, DDI_INTR_CLAIMED otherwise.
 */
/*ARGSUSED*/
static uint_t
myri10ge_intr(caddr_t arg0, caddr_t arg1)
{
	struct myri10ge_slice_state *ss =
	    (struct myri10ge_slice_state *)(void *)arg0;
	struct myri10ge_priv *mgp = ss->mgp;
	mcp_irq_data_t *stats = ss->fw_stats;
	myri10ge_tx_ring_t *tx = &ss->tx;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return (DDI_INTR_UNCLAIMED);
	}
	/* remember the flags; stats->valid itself is cleared below */
	valid = stats->valid;

	/* low bit indicates receives are present */
	if (valid & 1)
		myri10ge_intr_rx(ss);

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		if (!myri10ge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		mb();
	} else {
		/* no need to wait for conf. that irq is low */
		stats->valid = 0;
	}

	/*
	 * Keep handling transmit completions until valid reads zero.
	 * When this handler did not clear it above (legacy interrupt
	 * with myri10ge_deassert_wait set), the loop waits for it to be
	 * cleared elsewhere - presumably by the NIC once the IRQ line
	 * is low; confirm against the firmware interface.
	 */
	do {
		/* check for transmit completes and receives */
		send_done_count = ntohl(stats->send_done_count);
		if (send_done_count != tx->pkt_done)
			myri10ge_tx_done(ss, (int)send_done_count);
	} while (*((volatile uint8_t *) &stats->valid));

	if (stats->stats_updated) {
		if (mgp->link_state != stats->link_up || stats->link_down) {
			mgp->link_state = stats->link_up;
			if (stats->link_down) {
				/* myri10ge_stop_locked() watches down_cnt */
				mgp->down_cnt += stats->link_down;
				mgp->link_state = 0;
			}
			if (mgp->link_state) {
				if (myri10ge_verbose)
					printf("%s: link up\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_UP);
			} else {
				if (myri10ge_verbose)
					printf("%s: link down\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_DOWN);
			}
			MYRI10GE_NIC_STAT_INC(link_changes);
		}
		/* a change in the tag count is reported as an RDMA timeout */
		if (mgp->rdma_tags_available !=
		    ntohl(ss->fw_stats->rdma_tags_available)) {
			mgp->rdma_tags_available =
			    ntohl(ss->fw_stats->rdma_tags_available);
			cmn_err(CE_NOTE, "%s: RDMA timed out! "
			    "%d tags left\n", mgp->name,
			    mgp->rdma_tags_available);
		}
	}

	mb();
	/* check to see if we have rx token to pass back */
	if (valid & 0x1) {
		mutex_enter(&ss->poll_lock);
		if (ss->rx_polling) {
			/* polling is active: hold the token for later */
			ss->rx_token = 1;
		} else {
			*ss->irq_claim = BE_32(3);
			ss->rx_token = 0;
		}
		mutex_exit(&ss->poll_lock);
	}
	/* the second claim word is written on every claimed interrupt */
	*(ss->irq_claim + 1) = BE_32(3);
	return (DDI_INTR_CLAIMED);
}
2764 
2765 /*
2766  * Add or remove a multicast address.  This is called with our
2767  * macinfo's lock held by GLD, so we do not need to worry about
2768  * our own locking here.
2769  */
2770 static int
2771 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2772 {
2773 	myri10ge_cmd_t cmd;
2774 	struct myri10ge_priv *mgp = arg;
2775 	int status, join_leave;
2776 
2777 	if (add)
2778 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2779 	else
2780 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2781 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2782 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2783 	cmd.data0 = htonl(cmd.data0);
2784 	cmd.data1 = htonl(cmd.data1);
2785 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2786 	if (status == 0)
2787 		return (0);
2788 
2789 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2790 	    mgp->name);
2791 	return (status);
2792 }
2793 
2794 
2795 static int
2796 myri10ge_m_promisc(void *arg, boolean_t on)
2797 {
2798 	struct myri10ge_priv *mgp = arg;
2799 
2800 	myri10ge_change_promisc(mgp, on);
2801 	return (0);
2802 }
2803 
2804 /*
2805  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2806  *  backwards one at a time and handle ring wraps
2807  */
2808 
2809 static inline void
2810 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2811     mcp_kreq_ether_send_t *src, int cnt)
2812 {
2813 	int idx, starting_slot;
2814 	starting_slot = tx->req;
2815 	while (cnt > 1) {
2816 		cnt--;
2817 		idx = (starting_slot + cnt) & tx->mask;
2818 		myri10ge_pio_copy(&tx->lanai[idx],
2819 		    &src[cnt], sizeof (*src));
2820 		mb();
2821 	}
2822 }
2823 
2824 /*
2825  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2826  * at most 32 bytes at a time, so as to avoid involving the software
2827  * pio handler in the nic.   We re-write the first segment's flags
2828  * to mark them valid only after writing the entire chain
2829  */
2830 
2831 static inline void
2832 myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
2833     int cnt)
2834 {
2835 	int idx, i;
2836 	uint32_t *src_ints, *dst_ints;
2837 	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
2838 	uint8_t last_flags;
2839 
2840 	idx = tx->req & tx->mask;
2841 
2842 	last_flags = src->flags;
2843 	src->flags = 0;
2844 	mb();
2845 	dst = dstp = &tx->lanai[idx];
2846 	srcp = src;
2847 
2848 	if ((idx + cnt) < tx->mask) {
2849 		for (i = 0; i < (cnt - 1); i += 2) {
2850 			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
2851 			mb(); /* force write every 32 bytes */
2852 			srcp += 2;
2853 			dstp += 2;
2854 		}
2855 	} else {
2856 		/*
2857 		 * submit all but the first request, and ensure
2858 		 *  that it is submitted below
2859 		 */
2860 		myri10ge_submit_req_backwards(tx, src, cnt);
2861 		i = 0;
2862 	}
2863 	if (i < cnt) {
2864 		/* submit the first request */
2865 		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
2866 		mb(); /* barrier before setting valid flag */
2867 	}
2868 
2869 	/* re-write the last 32-bits with the valid flags */
2870 	src->flags |= last_flags;
2871 	src_ints = (uint32_t *)src;
2872 	src_ints += 3;
2873 	dst_ints = (uint32_t *)dst;
2874 	dst_ints += 3;
2875 	*dst_ints =  *src_ints;
2876 	tx->req += cnt;
2877 	mb();
2878 	/* notify NIC to poll this tx ring */
2879 	if (!tx->active && tx->go != NULL) {
2880 		*(int *)(void *)tx->go = 1;
2881 		tx->active = 1;
2882 		tx->activate++;
2883 		mb();
2884 	}
2885 }
2886 
2887 /* ARGSUSED */
2888 static inline void
2889 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2890 {
2891 	uint32_t lso_flag;
2892 	lso_info_get(mp, mss, &lso_flag);
2893 	(*flags) |= lso_flag;
2894 }
2895 
2896 
2897 /* like pullupmsg, except preserve hcksum/LSO attributes */
2898 static int
2899 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2900 {
2901 	uint32_t start, stuff, tx_offload_flags, mss;
2902 	int ok;
2903 
2904 	mss = 0;
2905 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, NULL, NULL,
2906 	    &tx_offload_flags);
2907 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2908 
2909 	ok = pullupmsg(mp, -1);
2910 	if (!ok) {
2911 		printf("pullupmsg failed");
2912 		return (DDI_FAILURE);
2913 	}
2914 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2915 	(void) hcksum_assoc(mp, NULL, NULL, start, stuff, NULL,
2916 	    NULL, tx_offload_flags, 0);
2917 	if (tx_offload_flags & HW_LSO)
2918 		DB_LSOMSS(mp) = (uint16_t)mss;
2919 	lso_info_set(mp, mss, tx_offload_flags);
2920 	return (DDI_SUCCESS);
2921 }
2922 
2923 static inline void
2924 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2925     int opackets, int obytes)
2926 {
2927 	s->un.all = 0;
2928 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2929 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2930 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2931 			s->un.s.brdcstxmt = 1;
2932 		else
2933 			s->un.s.multixmt = 1;
2934 	}
2935 	s->un.s.opackets = (uint16_t)opackets;
2936 	s->un.s.obytes = obytes;
2937 }
2938 
2939 static int
2940 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2941     mcp_kreq_ether_send_t *req)
2942 {
2943 	myri10ge_tx_ring_t *tx = &ss->tx;
2944 	caddr_t ptr;
2945 	struct myri10ge_tx_copybuf *cp;
2946 	mblk_t *bp;
2947 	int idx, mblen, avail;
2948 	uint16_t len;
2949 
2950 	mutex_enter(&tx->lock);
2951 	avail = tx->mask - (tx->req - tx->done);
2952 	if (avail <= 1) {
2953 		mutex_exit(&tx->lock);
2954 		return (EBUSY);
2955 	}
2956 	idx = tx->req & tx->mask;
2957 	cp = &tx->cp[idx];
2958 	ptr = cp->va;
2959 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2960 		mblen = MBLKL(bp);
2961 		bcopy(bp->b_rptr, ptr, mblen);
2962 		ptr += mblen;
2963 		len += mblen;
2964 	}
2965 	/* ensure runts are padded to 60 bytes */
2966 	if (len < 60) {
2967 		bzero(ptr, 64 - len);
2968 		len = 60;
2969 	}
2970 	req->addr_low = cp->dma.low;
2971 	req->addr_high = cp->dma.high;
2972 	req->length = htons(len);
2973 	req->pad = 0;
2974 	req->rdma_count = 1;
2975 	myri10ge_tx_stat(&tx->info[idx].stat,
2976 	    (struct ether_header *)(void *)cp->va, 1, len);
2977 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2978 	myri10ge_submit_req(&ss->tx, req, 1);
2979 	mutex_exit(&tx->lock);
2980 	freemsg(mp);
2981 	return (DDI_SUCCESS);
2982 }
2983 
2984 
2985 static void
2986 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2987     struct myri10ge_tx_buffer_state *tx_info,
2988     int count)
2989 {
2990 	int i, idx;
2991 
2992 	idx = 0; /* gcc -Wuninitialized */
2993 	/* store unmapping and bp info for tx irq handler */
2994 	for (i = 0; i < count; i++) {
2995 		idx = (tx->req + i) & tx->mask;
2996 		tx->info[idx].m = tx_info[i].m;
2997 		tx->info[idx].handle = tx_info[i].handle;
2998 	}
2999 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
3000 
3001 	/* submit the frame to the nic */
3002 	myri10ge_submit_req(tx, req_list, count);
3003 
3004 
3005 }
3006 
3007 
3008 
3009 static void
3010 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3011 {
3012 	mblk_t *bp;
3013 	int seglen;
3014 	uint_t count;
3015 
3016 	bp = mp;
3017 
3018 	while (off > 0) {
3019 		seglen = MBLKL(bp);
3020 		if (off < seglen)
3021 			break;
3022 		off -= seglen;
3023 		bp = bp->b_cont;
3024 	}
3025 	while (len > 0) {
3026 		seglen = MBLKL(bp);
3027 		count = min(seglen - off, len);
3028 		bcopy(bp->b_rptr + off, buf, count);
3029 		len -= count;
3030 		buf += count;
3031 		off = 0;
3032 		bp = bp->b_cont;
3033 	}
3034 }
3035 
3036 static int
3037 myri10ge_ether_parse_header(mblk_t *mp)
3038 {
3039 	struct ether_header eh_copy;
3040 	struct ether_header *eh;
3041 	int eth_hdr_len, seglen;
3042 
3043 	seglen = MBLKL(mp);
3044 	eth_hdr_len = sizeof (*eh);
3045 	if (seglen < eth_hdr_len) {
3046 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3047 		eh = &eh_copy;
3048 	} else {
3049 		eh = (struct ether_header *)(void *)mp->b_rptr;
3050 	}
3051 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3052 		eth_hdr_len += 4;
3053 	}
3054 
3055 	return (eth_hdr_len);
3056 }
3057 
/*
 * Locate the IP and TCP headers of an LSO packet, force the IP header
 * checksum in the message itself to zero, and return the total header
 * length.
 *
 * mp	message to parse
 * off	byte offset of the IP header within the message
 *
 * Returns off plus the IP and TCP header lengths, i.e. the offset of
 * the first payload byte.  Headers that are not contiguous in the
 * first mblk are read through a local copy.
 */
static int
myri10ge_lso_parse_header(mblk_t *mp, int off)
{
	char buf[128];
	int seglen, sum_off;
	struct ip *ip;
	struct tcphdr *tcp;

	seglen = MBLKL(mp);
	/* get at the fixed part of the IP header, copying if split */
	if (seglen < off + sizeof (*ip)) {
		myri10ge_copydata(mp, off, sizeof (*ip), buf);
		ip = (struct ip *)(void *)buf;
	} else {
		ip = (struct ip *)(void *)(mp->b_rptr + off);
	}
	/* now that ip_hl is known, make sure IP options + TCP are there */
	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
		myri10ge_copydata(mp, off,
		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
		ip = (struct ip *)(void *)buf;
	}
	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));

	/*
	 * NIC expects ip_sum to be zero.  Recent changes to
	 * OpenSolaris leave the correct ip checksum there, rather
	 * than the required zero, so we need to zero it.  Otherwise,
	 * the NIC will produce bad checksums when sending LSO packets.
	 */
	if (ip->ip_sum != 0) {
		if (((char *)ip) != buf) {
			/* ip points into mblk, so just zero it */
			ip->ip_sum = 0;
		} else {
			/*
			 * ip points into a copy, so walk the chain
			 * to find the ip_csum, then zero it
			 */
			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
			/* find the mblk holding the first checksum byte */
			while (sum_off > (int)(MBLKL(mp) - 1)) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
			/* the second byte may live in a following mblk */
			sum_off++;
			while (sum_off > MBLKL(mp) - 1) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
		}
	}
	/* ip_hl and th_off count 32-bit words, hence the shift by 2 */
	return (off + ((ip->ip_hl + tcp->th_off) << 2));
}
3111 
3112 static int
3113 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3114     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3115     uint16_t mss, uint8_t cksum_offset)
3116 {
3117 	myri10ge_tx_ring_t *tx = &ss->tx;
3118 	struct myri10ge_priv *mgp = ss->mgp;
3119 	mblk_t *bp;
3120 	mcp_kreq_ether_send_t *req;
3121 	struct myri10ge_tx_copybuf *cp;
3122 	caddr_t rptr, ptr;
3123 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3124 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3125 	int rdma_count;
3126 	uint32_t seglen, len, boundary, low, high_swapped;
3127 	uint16_t pseudo_hdr_offset = htons(mss);
3128 	uint8_t flags;
3129 
3130 	tx_boundary = mgp->tx_boundary;
3131 	hdr_size_tmp = hdr_size;
3132 	resid = tx_boundary;
3133 	count = 1;
3134 	mutex_enter(&tx->lock);
3135 
3136 	/* check to see if the slots are really there */
3137 	avail = tx->mask - (tx->req - tx->done);
3138 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3139 		atomic_add_32(&tx->stall, 1);
3140 		mutex_exit(&tx->lock);
3141 		return (EBUSY);
3142 	}
3143 
3144 	/* copy */
3145 	cum_len = -hdr_size;
3146 	count = 0;
3147 	req = req_list;
3148 	idx = tx->mask & tx->req;
3149 	cp = &tx->cp[idx];
3150 	low = ntohl(cp->dma.low);
3151 	ptr = cp->va;
3152 	cp->len = 0;
3153 	if (mss) {
3154 		int payload = pkt_size - hdr_size;
3155 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3156 		tx->info[idx].ostat.opackets = opackets;
3157 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3158 		    + pkt_size;
3159 	}
3160 	hdr_size_tmp = hdr_size;
3161 	mss_resid = mss;
3162 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3163 	tx_req = tx->req;
3164 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3165 		mblen = MBLKL(bp);
3166 		rptr = (caddr_t)bp->b_rptr;
3167 		len = min(hdr_size_tmp, mblen);
3168 		if (len) {
3169 			bcopy(rptr, ptr, len);
3170 			rptr += len;
3171 			ptr += len;
3172 			resid -= len;
3173 			mblen -= len;
3174 			hdr_size_tmp -= len;
3175 			cp->len += len;
3176 			if (hdr_size_tmp)
3177 				continue;
3178 			if (resid < mss) {
3179 				tx_req++;
3180 				idx = tx->mask & tx_req;
3181 				cp = &tx->cp[idx];
3182 				low = ntohl(cp->dma.low);
3183 				ptr = cp->va;
3184 				resid = tx_boundary;
3185 			}
3186 		}
3187 		while (mblen) {
3188 			len = min(mss_resid, mblen);
3189 			bcopy(rptr, ptr, len);
3190 			mss_resid -= len;
3191 			resid -= len;
3192 			mblen -= len;
3193 			rptr += len;
3194 			ptr += len;
3195 			cp->len += len;
3196 			if (mss_resid == 0) {
3197 				mss_resid = mss;
3198 				if (resid < mss) {
3199 					tx_req++;
3200 					idx = tx->mask & tx_req;
3201 					cp = &tx->cp[idx];
3202 					cp->len = 0;
3203 					low = ntohl(cp->dma.low);
3204 					ptr = cp->va;
3205 					resid = tx_boundary;
3206 				}
3207 			}
3208 		}
3209 	}
3210 
3211 	req = req_list;
3212 	pkt_size_tmp = pkt_size;
3213 	count = 0;
3214 	rdma_count = 0;
3215 	tx_req = tx->req;
3216 	while (pkt_size_tmp) {
3217 		idx = tx->mask & tx_req;
3218 		cp = &tx->cp[idx];
3219 		high_swapped = cp->dma.high;
3220 		low = ntohl(cp->dma.low);
3221 		len = cp->len;
3222 		if (len == 0) {
3223 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3224 			    pkt_size_tmp, pkt_size);
3225 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3226 				mblen = MBLKL(bp);
3227 				printf("mblen:%d\n", mblen);
3228 			}
3229 			pkt_size_tmp = pkt_size;
3230 			tx_req = tx->req;
3231 			while (pkt_size_tmp > 0) {
3232 				idx = tx->mask & tx_req;
3233 				cp = &tx->cp[idx];
3234 				printf("cp->len = %d\n", cp->len);
3235 				pkt_size_tmp -= cp->len;
3236 				tx_req++;
3237 			}
3238 			printf("dropped\n");
3239 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3240 			goto done;
3241 		}
3242 		pkt_size_tmp -= len;
3243 		while (len) {
3244 			while (len) {
3245 				uint8_t flags_next;
3246 				int cum_len_next;
3247 
3248 				boundary = (low + mgp->tx_boundary) &
3249 				    ~(mgp->tx_boundary - 1);
3250 				seglen = boundary - low;
3251 				if (seglen > len)
3252 					seglen = len;
3253 
3254 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3255 				cum_len_next = cum_len + seglen;
3256 				(req-rdma_count)->rdma_count = rdma_count + 1;
3257 				if (likely(cum_len >= 0)) {
3258 					/* payload */
3259 					int next_is_first, chop;
3260 
3261 					chop = (cum_len_next > mss);
3262 					cum_len_next = cum_len_next % mss;
3263 					next_is_first = (cum_len_next == 0);
3264 					flags |= chop *
3265 					    MXGEFW_FLAGS_TSO_CHOP;
3266 					flags_next |= next_is_first *
3267 					    MXGEFW_FLAGS_FIRST;
3268 					rdma_count |= -(chop | next_is_first);
3269 					rdma_count += chop & !next_is_first;
3270 				} else if (likely(cum_len_next >= 0)) {
3271 					/* header ends */
3272 					int small;
3273 
3274 					rdma_count = -1;
3275 					cum_len_next = 0;
3276 					seglen = -cum_len;
3277 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3278 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3279 					    MXGEFW_FLAGS_FIRST |
3280 					    (small * MXGEFW_FLAGS_SMALL);
3281 				}
3282 				req->addr_high = high_swapped;
3283 				req->addr_low = htonl(low);
3284 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3285 				req->pad = 0; /* complete solid 16-byte block */
3286 				req->rdma_count = 1;
3287 				req->cksum_offset = cksum_offset;
3288 				req->length = htons(seglen);
3289 				req->flags = flags | ((cum_len & 1) *
3290 				    MXGEFW_FLAGS_ALIGN_ODD);
3291 				if (cksum_offset > seglen)
3292 					cksum_offset -= seglen;
3293 				else
3294 					cksum_offset = 0;
3295 				low += seglen;
3296 				len -= seglen;
3297 				cum_len = cum_len_next;
3298 				req++;
3299 				req->flags = 0;
3300 				flags = flags_next;
3301 				count++;
3302 				rdma_count++;
3303 			}
3304 		}
3305 		tx_req++;
3306 	}
3307 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3308 	do {
3309 		req--;
3310 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3311 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3312 	    MXGEFW_FLAGS_FIRST)));
3313 
3314 	myri10ge_submit_req(tx, req_list, count);
3315 done:
3316 	mutex_exit(&tx->lock);
3317 	freemsg(mp);
3318 	return (DDI_SUCCESS);
3319 }
3320 
3321 /*
3322  * Try to send the chain of buffers described by the mp.  We must not
3323  * encapsulate more than eth->tx.req - eth->tx.done, or
3324  * MXGEFW_MAX_SEND_DESC, whichever is more.
3325  */
3326 
3327 static int
3328 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3329     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3330 {
3331 	struct myri10ge_priv *mgp = ss->mgp;
3332 	myri10ge_tx_ring_t *tx = &ss->tx;
3333 	mcp_kreq_ether_send_t *req;
3334 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3335 	mblk_t  *bp;
3336 	ddi_dma_cookie_t cookie;
3337 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3338 	    rdma_count, cum_len, lso_hdr_size;
3339 	uint32_t start, stuff, tx_offload_flags;
3340 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3341 	uint_t ncookies;
3342 	uint16_t pseudo_hdr_offset;
3343 	uint8_t flags, cksum_offset, odd_flag;
3344 	int pkt_size;
3345 	int lso_copy = myri10ge_lso_copy;
3346 	try_pullup = 1;
3347 
3348 again:
3349 	/* Setup checksum offloading, if needed */
3350 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, NULL, NULL,
3351 	    &tx_offload_flags);
3352 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3353 	if (tx_offload_flags & HW_LSO) {
3354 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3355 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3356 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3357 			freemsg(mp);
3358 			return (DDI_SUCCESS);
3359 		}
3360 	} else {
3361 		max_segs = MXGEFW_MAX_SEND_DESC;
3362 		mss = 0;
3363 	}
3364 	req = req_list;
3365 	cksum_offset = 0;
3366 	pseudo_hdr_offset = 0;
3367 
3368 	/* leave an extra slot keep the ring from wrapping */
3369 	avail = tx->mask - (tx->req - tx->done);
3370 
3371 	/*
3372 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3373 	 * message will need to be pulled up in order to fit.
3374 	 * Otherwise, we are low on transmit descriptors, it is
3375 	 * probably better to stall and try again rather than pullup a
3376 	 * message to fit.
3377 	 */
3378 
3379 	if (avail < max_segs) {
3380 		err = EBUSY;
3381 		atomic_add_32(&tx->stall_early, 1);
3382 		goto stall;
3383 	}
3384 
3385 	/* find out how long the frame is and how many segments it is */
3386 	count = 0;
3387 	odd_flag = 0;
3388 	pkt_size = 0;
3389 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3390 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3391 		dblk_t *dbp;
3392 		mblen = MBLKL(bp);
3393 		if (mblen == 0) {
3394 			/*
3395 			 * we can't simply skip over 0-length mblks
3396 			 * because the hardware can't deal with them,
3397 			 * and we could leak them.
3398 			 */
3399 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3400 			err = EIO;
3401 			goto pullup;
3402 		}
3403 		/*
3404 		 * There's no advantage to copying most gesballoc
3405 		 * attached blocks, so disable lso copy in that case
3406 		 */
3407 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3408 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3409 				lso_copy = 0;
3410 			}
3411 		}
3412 		pkt_size += mblen;
3413 		count++;
3414 	}
3415 
3416 	/* Try to pull up excessivly long chains */
3417 	if (count >= max_segs) {
3418 		err = myri10ge_pullup(ss, mp);
3419 		if (likely(err == DDI_SUCCESS)) {
3420 			count = 1;
3421 		} else {
3422 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3423 				/*
3424 				 * just let the h/w send it, it will be
3425 				 * inefficient, but us better than dropping
3426 				 */
3427 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3428 			} else {
3429 				/* drop it */
3430 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3431 				freemsg(mp);
3432 				return (0);
3433 			}
3434 		}
3435 	}
3436 
3437 	cum_len = 0;
3438 	maclen = myri10ge_ether_parse_header(mp);
3439 
3440 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3441 
3442 		cksum_offset = start + maclen;
3443 		pseudo_hdr_offset = htons(stuff + maclen);
3444 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3445 		flags |= MXGEFW_FLAGS_CKSUM;
3446 	}
3447 
3448 	lso_hdr_size = 0; /* -Wunitinialized */
3449 	if (mss) { /* LSO */
3450 		/* this removes any CKSUM flag from before */
3451 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3452 		/*
3453 		 * parse the headers and set cum_len to a negative
3454 		 * value to reflect the offset of the TCP payload
3455 		 */
3456 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3457 		cum_len = -lso_hdr_size;
3458 		if ((mss < mgp->tx_boundary) && lso_copy) {
3459 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3460 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3461 			return (err);
3462 		}
3463 
3464 		/*
3465 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3466 		 * figures out where to put the checksum by parsing
3467 		 * the header.
3468 		 */
3469 
3470 		pseudo_hdr_offset = htons(mss);
3471 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3472 		flags |= MXGEFW_FLAGS_SMALL;
3473 		if (pkt_size < myri10ge_tx_copylen) {
3474 			req->cksum_offset = cksum_offset;
3475 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3476 			req->flags = flags;
3477 			err = myri10ge_tx_copy(ss, mp, req);
3478 			return (err);
3479 		}
3480 		cum_len = 0;
3481 	}
3482 
3483 	/* pull one DMA handle for each bp from our freelist */
3484 	handles = NULL;
3485 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3486 	if (err != DDI_SUCCESS) {
3487 		err = DDI_FAILURE;
3488 		goto stall;
3489 	}
3490 	count = 0;
3491 	rdma_count = 0;
3492 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3493 		mblen = MBLKL(bp);
3494 		dma_handle = handles;
3495 		handles = handles->next;
3496 
3497 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3498 		    (caddr_t)bp->b_rptr, mblen,
3499 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3500 		    &cookie, &ncookies);
3501 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3502 			err = EIO;
3503 			try_pullup = 0;
3504 			dma_handle->next = handles;
3505 			handles = dma_handle;
3506 			goto abort_with_handles;
3507 		}
3508 
3509 		/* reserve the slot */
3510 		tx_info[count].m = bp;
3511 		tx_info[count].handle = dma_handle;
3512 
3513 		for (; ; ) {
3514 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3515 			high_swapped =
3516 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3517 			    cookie.dmac_laddress));
3518 			len = (uint32_t)cookie.dmac_size;
3519 			while (len) {
3520 				uint8_t flags_next;
3521 				int cum_len_next;
3522 
3523 				boundary = (low + mgp->tx_boundary) &
3524 				    ~(mgp->tx_boundary - 1);
3525 				seglen = boundary - low;
3526 				if (seglen > len)
3527 					seglen = len;
3528 
3529 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3530 				cum_len_next = cum_len + seglen;
3531 				if (mss) {
3532 					(req-rdma_count)->rdma_count =
3533 					    rdma_count + 1;
3534 					if (likely(cum_len >= 0)) {
3535 						/* payload */
3536 						int next_is_first, chop;
3537 
3538 						chop = (cum_len_next > mss);
3539 						cum_len_next =
3540 						    cum_len_next % mss;
3541 						next_is_first =
3542 						    (cum_len_next == 0);
3543 						flags |= chop *
3544 						    MXGEFW_FLAGS_TSO_CHOP;
3545 						flags_next |= next_is_first *
3546 						    MXGEFW_FLAGS_FIRST;
3547 						rdma_count |=
3548 						    -(chop | next_is_first);
3549 						rdma_count +=
3550 						    chop & !next_is_first;
3551 					} else if (likely(cum_len_next >= 0)) {
3552 						/* header ends */
3553 						int small;
3554 
3555 						rdma_count = -1;
3556 						cum_len_next = 0;
3557 						seglen = -cum_len;
3558 						small = (mss <=
3559 						    MXGEFW_SEND_SMALL_SIZE);
3560 						flags_next =
3561 						    MXGEFW_FLAGS_TSO_PLD
3562 						    | MXGEFW_FLAGS_FIRST
3563 						    | (small *
3564 						    MXGEFW_FLAGS_SMALL);
3565 					}
3566 				}
3567 				req->addr_high = high_swapped;
3568 				req->addr_low = htonl(low);
3569 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3570 				req->pad = 0; /* complete solid 16-byte block */
3571 				req->rdma_count = 1;
3572 				req->cksum_offset = cksum_offset;
3573 				req->length = htons(seglen);
3574 				req->flags = flags | ((cum_len & 1) * odd_flag);
3575 				if (cksum_offset > seglen)
3576 					cksum_offset -= seglen;
3577 				else
3578 					cksum_offset = 0;
3579 				low += seglen;
3580 				len -= seglen;
3581 				cum_len = cum_len_next;
3582 				count++;
3583 				rdma_count++;
3584 				/*  make sure all the segments will fit */
3585 				if (unlikely(count >= max_segs)) {
3586 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3587 					    xmit_lowbuf);
3588 					/* may try a pullup */
3589 					err = EBUSY;
3590 					if (try_pullup)
3591 						try_pullup = 2;
3592 					goto abort_with_handles;
3593 				}
3594 				req++;
3595 				req->flags = 0;
3596 				flags = flags_next;
3597 				tx_info[count].m = 0;
3598 			}
3599 			ncookies--;
3600 			if (ncookies == 0)
3601 				break;
3602 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3603 		}
3604 	}
3605 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3606 
3607 	if (mss) {
3608 		do {
3609 			req--;
3610 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3611 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3612 		    MXGEFW_FLAGS_FIRST)));
3613 	}
3614 
3615 	/* calculate tx stats */
3616 	if (mss) {
3617 		uint16_t opackets;
3618 		int payload;
3619 
3620 		payload = pkt_size - lso_hdr_size;
3621 		opackets = (payload / mss) + ((payload % mss) != 0);
3622 		tx_info[0].stat.un.all = 0;
3623 		tx_info[0].ostat.opackets = opackets;
3624 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3625 		    + pkt_size;
3626 	} else {
3627 		myri10ge_tx_stat(&tx_info[0].stat,
3628 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3629 	}
3630 	mutex_enter(&tx->lock);
3631 
3632 	/* check to see if the slots are really there */
3633 	avail = tx->mask - (tx->req - tx->done);
3634 	if (unlikely(avail <= count)) {
3635 		mutex_exit(&tx->lock);
3636 		err = 0;
3637 		goto late_stall;
3638 	}
3639 
3640 	myri10ge_send_locked(tx, req_list, tx_info, count);
3641 	mutex_exit(&tx->lock);
3642 	return (DDI_SUCCESS);
3643 
3644 late_stall:
3645 	try_pullup = 0;
3646 	atomic_add_32(&tx->stall_late, 1);
3647 
3648 abort_with_handles:
3649 	/* unbind and free handles from previous mblks */
3650 	for (i = 0; i < count; i++) {
3651 		bp = tx_info[i].m;
3652 		tx_info[i].m = 0;
3653 		if (bp) {
3654 			dma_handle = tx_info[i].handle;
3655 			(void) ddi_dma_unbind_handle(dma_handle->h);
3656 			dma_handle->next = handles;
3657 			handles = dma_handle;
3658 			tx_info[i].handle = NULL;
3659 			tx_info[i].m = NULL;
3660 		}
3661 	}
3662 	myri10ge_free_tx_handle_slist(tx, handles);
3663 pullup:
3664 	if (try_pullup) {
3665 		err = myri10ge_pullup(ss, mp);
3666 		if (err != DDI_SUCCESS && try_pullup == 2) {
3667 			/* drop */
3668 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3669 			freemsg(mp);
3670 			return (0);
3671 		}
3672 		try_pullup = 0;
3673 		goto again;
3674 	}
3675 
3676 stall:
3677 	if (err != 0) {
3678 		if (err == EBUSY) {
3679 			atomic_add_32(&tx->stall, 1);
3680 		} else {
3681 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3682 		}
3683 	}
3684 	return (err);
3685 }
3686 
3687 static mblk_t *
3688 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3689 {
3690 	struct myri10ge_slice_state *ss = arg;
3691 	int err = 0;
3692 	mcp_kreq_ether_send_t *req_list;
3693 #if defined(__i386)
3694 	/*
3695 	 * We need about 2.5KB of scratch space to handle transmits.
3696 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3697 	 * scratch space there rather than keeping it on the stack.
3698 	 */
3699 	size_t req_size, tx_info_size;
3700 	struct myri10ge_tx_buffer_state *tx_info;
3701 	caddr_t req_bytes;
3702 
3703 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3704 	    + 8;
3705 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3706 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3707 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3708 #else
3709 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3710 	    + 8];
3711 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3712 #endif
3713 
3714 	/* ensure req_list entries are aligned to 8 bytes */
3715 	req_list = (struct mcp_kreq_ether_send *)
3716 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3717 
3718 	err = myri10ge_send(ss, mp, req_list, tx_info);
3719 
3720 #if defined(__i386)
3721 	kmem_free(tx_info, tx_info_size);
3722 	kmem_free(req_bytes, req_size);
3723 #endif
3724 	if (err)
3725 		return (mp);
3726 	else
3727 		return (NULL);
3728 }
3729 
3730 static int
3731 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3732 {
3733 	struct myri10ge_priv *mgp = arg;
3734 	int err;
3735 
3736 	if (mac_addr == NULL)
3737 		return (EINVAL);
3738 
3739 	mutex_enter(&mgp->intrlock);
3740 	if (mgp->macaddr_cnt) {
3741 		mutex_exit(&mgp->intrlock);
3742 		return (ENOSPC);
3743 	}
3744 	err = myri10ge_m_unicst(mgp, mac_addr);
3745 	if (!err)
3746 		mgp->macaddr_cnt++;
3747 
3748 	mutex_exit(&mgp->intrlock);
3749 	if (err)
3750 		return (err);
3751 
3752 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3753 	return (0);
3754 }
3755 
3756 /*ARGSUSED*/
3757 static int
3758 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3759 {
3760 	struct myri10ge_priv *mgp = arg;
3761 
3762 	mutex_enter(&mgp->intrlock);
3763 	mgp->macaddr_cnt--;
3764 	mutex_exit(&mgp->intrlock);
3765 
3766 	return (0);
3767 }
3768 
3769 /*ARGSUSED*/
3770 static void
3771 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3772     mac_group_info_t *infop, mac_group_handle_t gh)
3773 {
3774 	struct myri10ge_priv *mgp = arg;
3775 
3776 	if (rtype != MAC_RING_TYPE_RX)
3777 		return;
3778 
3779 	infop->mgi_driver = (mac_group_driver_t)mgp;
3780 	infop->mgi_start = NULL;
3781 	infop->mgi_stop = NULL;
3782 	infop->mgi_addmac = myri10ge_addmac;
3783 	infop->mgi_remmac = myri10ge_remmac;
3784 	infop->mgi_count = mgp->num_slices;
3785 }
3786 
3787 static int
3788 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3789 {
3790 	struct myri10ge_slice_state *ss;
3791 
3792 	ss = (struct myri10ge_slice_state *)rh;
3793 	mutex_enter(&ss->rx_lock);
3794 	ss->rx_gen_num = mr_gen_num;
3795 	mutex_exit(&ss->rx_lock);
3796 	return (0);
3797 }
3798 
3799 static int
3800 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3801 {
3802 	struct myri10ge_slice_state *ss;
3803 
3804 	ss = (struct myri10ge_slice_state *)intrh;
3805 	mutex_enter(&ss->poll_lock);
3806 	ss->rx_polling = B_TRUE;
3807 	mutex_exit(&ss->poll_lock);
3808 	return (0);
3809 }
3810 
3811 static int
3812 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3813 {
3814 	struct myri10ge_slice_state *ss;
3815 
3816 	ss = (struct myri10ge_slice_state *)intrh;
3817 	mutex_enter(&ss->poll_lock);
3818 	ss->rx_polling = B_FALSE;
3819 	if (ss->rx_token) {
3820 		*ss->irq_claim = BE_32(3);
3821 		ss->rx_token = 0;
3822 	}
3823 	mutex_exit(&ss->poll_lock);
3824 	return (0);
3825 }
3826 
3827 /*ARGSUSED*/
3828 static void
3829 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3830     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3831 {
3832 	struct myri10ge_priv *mgp = arg;
3833 	struct myri10ge_slice_state *ss;
3834 	mac_intr_t *mintr = &infop->mri_intr;
3835 
3836 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3837 
3838 	ss = &mgp->ss[ring_index];
3839 	switch (rtype) {
3840 	case MAC_RING_TYPE_RX:
3841 		ss->rx_rh = rh;
3842 		infop->mri_driver = (mac_ring_driver_t)ss;
3843 		infop->mri_start = myri10ge_ring_start;
3844 		infop->mri_stop = NULL;
3845 		infop->mri_poll = myri10ge_poll_rx;
3846 		mintr->mi_handle = (mac_intr_handle_t)ss;
3847 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3848 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3849 		break;
3850 	case MAC_RING_TYPE_TX:
3851 		ss->tx.rh = rh;
3852 		infop->mri_driver = (mac_ring_driver_t)ss;
3853 		infop->mri_start = NULL;
3854 		infop->mri_stop = NULL;
3855 		infop->mri_tx = myri10ge_send_wrapper;
3856 		break;
3857 	default:
3858 		break;
3859 	}
3860 }
3861 
3862 static void
3863 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3864 {
3865 	if (mgp->ksp_stat == NULL)
3866 		return;
3867 
3868 	kstat_delete(mgp->ksp_stat);
3869 	mgp->ksp_stat = NULL;
3870 }
3871 
3872 static void
3873 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3874 {
3875 	if (ss->ksp_stat == NULL)
3876 		return;
3877 
3878 	kstat_delete(ss->ksp_stat);
3879 	ss->ksp_stat = NULL;
3880 }
3881 
3882 static void
3883 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3884 {
3885 	if (mgp->ksp_info == NULL)
3886 		return;
3887 
3888 	kstat_delete(mgp->ksp_info);
3889 	mgp->ksp_info = NULL;
3890 }
3891 
3892 static int
3893 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3894 {
3895 	struct myri10ge_nic_stat *ethstat;
3896 	struct myri10ge_priv *mgp;
3897 	mcp_irq_data_t *fw_stats;
3898 
3899 
3900 	if (rw == KSTAT_WRITE)
3901 		return (EACCES);
3902 
3903 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3904 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3905 	fw_stats = mgp->ss[0].fw_stats;
3906 
3907 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3908 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3909 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3910 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3911 		ethstat->dma_force_physical.value.ul = 1;
3912 	else
3913 		ethstat->dma_force_physical.value.ul = 0;
3914 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3915 	ethstat->dropped_bad_crc32.value.ul =
3916 	    ntohl(fw_stats->dropped_bad_crc32);
3917 	ethstat->dropped_bad_phy.value.ul =
3918 	    ntohl(fw_stats->dropped_bad_phy);
3919 	ethstat->dropped_link_error_or_filtered.value.ul =
3920 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3921 	ethstat->dropped_link_overflow.value.ul =
3922 	    ntohl(fw_stats->dropped_link_overflow);
3923 	ethstat->dropped_multicast_filtered.value.ul =
3924 	    ntohl(fw_stats->dropped_multicast_filtered);
3925 	ethstat->dropped_no_big_buffer.value.ul =
3926 	    ntohl(fw_stats->dropped_no_big_buffer);
3927 	ethstat->dropped_no_small_buffer.value.ul =
3928 	    ntohl(fw_stats->dropped_no_small_buffer);
3929 	ethstat->dropped_overrun.value.ul =
3930 	    ntohl(fw_stats->dropped_overrun);
3931 	ethstat->dropped_pause.value.ul =
3932 	    ntohl(fw_stats->dropped_pause);
3933 	ethstat->dropped_runt.value.ul =
3934 	    ntohl(fw_stats->dropped_runt);
3935 	ethstat->link_up.value.ul =
3936 	    ntohl(fw_stats->link_up);
3937 	ethstat->dropped_unicast_filtered.value.ul =
3938 	    ntohl(fw_stats->dropped_unicast_filtered);
3939 	return (0);
3940 }
3941 
3942 static int
3943 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
3944 {
3945 	struct myri10ge_slice_stat *ethstat;
3946 	struct myri10ge_slice_state *ss;
3947 
3948 	if (rw == KSTAT_WRITE)
3949 		return (EACCES);
3950 
3951 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
3952 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
3953 
3954 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
3955 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
3956 	ethstat->rx_bigbuf_pool.value.ul =
3957 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
3958 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
3959 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
3960 	    (ss->rx_small.mask + 1);
3961 	ethstat->tx_done.value.ul = ss->tx.done;
3962 	ethstat->tx_req.value.ul = ss->tx.req;
3963 	ethstat->tx_activate.value.ul = ss->tx.activate;
3964 	ethstat->xmit_sched.value.ul = ss->tx.sched;
3965 	ethstat->xmit_stall.value.ul = ss->tx.stall;
3966 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
3967 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
3968 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
3969 	return (0);
3970 }
3971 
3972 static int
3973 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
3974 {
3975 	struct myri10ge_info *info;
3976 	struct myri10ge_priv *mgp;
3977 
3978 
3979 	if (rw == KSTAT_WRITE)
3980 		return (EACCES);
3981 
3982 	info = (struct myri10ge_info *)ksp->ks_data;
3983 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3984 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
3985 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
3986 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
3987 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
3988 	kstat_named_setstr(&info->product_code, mgp->pc_str);
3989 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
3990 	return (0);
3991 }
3992 
/*
 * Template for the "myri10ge_info" named kstat: six string-valued
 * entries.  myri10ge_info_init() installs this object itself as the
 * ks_data of the kstat it creates, and myri10ge_info_kstat_update()
 * fills in the string pointers.
 */
static struct myri10ge_info myri10ge_info_template = {
	{ "driver_version",	KSTAT_DATA_STRING },
	{ "firmware_version",	KSTAT_DATA_STRING },
	{ "firmware_name",	KSTAT_DATA_STRING },
	{ "interrupt_type",	KSTAT_DATA_STRING },
	{ "product_code",	KSTAT_DATA_STRING },
	{ "serial_number",	KSTAT_DATA_STRING },
};
/* installed as the ks_lock of the info kstat by myri10ge_info_init() */
static kmutex_t myri10ge_info_template_lock;
4002 
4003 
4004 static int
4005 myri10ge_info_init(struct myri10ge_priv *mgp)
4006 {
4007 	struct kstat *ksp;
4008 
4009 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4010 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4011 	    sizeof (myri10ge_info_template) /
4012 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4013 	if (ksp == NULL) {
4014 		cmn_err(CE_WARN,
4015 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4016 		return (DDI_FAILURE);
4017 	}
4018 	mgp->ksp_info = ksp;
4019 	ksp->ks_update = myri10ge_info_kstat_update;
4020 	ksp->ks_private = (void *) mgp;
4021 	ksp->ks_data = &myri10ge_info_template;
4022 	ksp->ks_lock = &myri10ge_info_template_lock;
4023 	if (MYRI10GE_VERSION_STR != NULL)
4024 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4025 	if (mgp->fw_version != NULL)
4026 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4027 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4028 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4029 	if (mgp->pc_str != NULL)
4030 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4031 	if (mgp->sn_str != NULL)
4032 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4033 
4034 	kstat_install(ksp);
4035 	return (DDI_SUCCESS);
4036 }
4037 
4038 
4039 static int
4040 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4041 {
4042 	struct kstat *ksp;
4043 	struct myri10ge_nic_stat *ethstat;
4044 
4045 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4046 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4047 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4048 	if (ksp == NULL) {
4049 		cmn_err(CE_WARN,
4050 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4051 		return (DDI_FAILURE);
4052 	}
4053 	mgp->ksp_stat = ksp;
4054 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4055 
4056 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4057 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4058 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4059 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4060 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4061 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4062 	kstat_named_init(&ethstat->dma_force_physical,
4063 	    "dma_force_physical", KSTAT_DATA_ULONG);
4064 	kstat_named_init(&ethstat->lanes,
4065 	    "lanes", KSTAT_DATA_ULONG);
4066 	kstat_named_init(&ethstat->dropped_bad_crc32,
4067 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4068 	kstat_named_init(&ethstat->dropped_bad_phy,
4069 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4070 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4071 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4072 	kstat_named_init(&ethstat->dropped_link_overflow,
4073 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4074 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4075 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4076 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4077 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4078 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4079 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4080 	kstat_named_init(&ethstat->dropped_overrun,
4081 	    "dropped_overrun", KSTAT_DATA_ULONG);
4082 	kstat_named_init(&ethstat->dropped_pause,
4083 	    "dropped_pause", KSTAT_DATA_ULONG);
4084 	kstat_named_init(&ethstat->dropped_runt,
4085 	    "dropped_runt", KSTAT_DATA_ULONG);
4086 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4087 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4088 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4089 	    KSTAT_DATA_ULONG);
4090 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4091 	kstat_named_init(&ethstat->link_changes, "link_changes",
4092 	    KSTAT_DATA_ULONG);
4093 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4094 	ksp->ks_private = (void *) mgp;
4095 	kstat_install(ksp);
4096 	return (DDI_SUCCESS);
4097 }
4098 
4099 static int
4100 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4101 {
4102 	struct myri10ge_priv *mgp = ss->mgp;
4103 	struct kstat *ksp;
4104 	struct myri10ge_slice_stat *ethstat;
4105 	int instance;
4106 
4107 	/*
4108 	 * fake an instance so that the same slice numbers from
4109 	 * different instances do not collide
4110 	 */
4111 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4112 	ksp = kstat_create("myri10ge", instance,
4113 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4114 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4115 	if (ksp == NULL) {
4116 		cmn_err(CE_WARN,
4117 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4118 		return (DDI_FAILURE);
4119 	}
4120 	ss->ksp_stat = ksp;
4121 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4122 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4123 	    KSTAT_DATA_ULONG);
4124 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4125 	    KSTAT_DATA_ULONG);
4126 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4127 	    KSTAT_DATA_ULONG);
4128 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4129 	    KSTAT_DATA_ULONG);
4130 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4131 	    KSTAT_DATA_ULONG);
4132 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4133 	    KSTAT_DATA_ULONG);
4134 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4135 	    KSTAT_DATA_ULONG);
4136 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4137 	    KSTAT_DATA_ULONG);
4138 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4139 	    KSTAT_DATA_ULONG);
4140 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4141 	    KSTAT_DATA_ULONG);
4142 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4143 	    KSTAT_DATA_ULONG);
4144 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4145 	    KSTAT_DATA_ULONG);
4146 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4147 	    KSTAT_DATA_ULONG);
4148 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4149 	    KSTAT_DATA_ULONG);
4150 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4151 	    KSTAT_DATA_ULONG);
4152 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4153 	    KSTAT_DATA_ULONG);
4154 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4155 	    KSTAT_DATA_ULONG);
4156 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4157 	    KSTAT_DATA_ULONG);
4158 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4159 	    KSTAT_DATA_ULONG);
4160 	kstat_named_init(&ethstat->tx_req, "tx_req",
4161 	    KSTAT_DATA_ULONG);
4162 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4163 	    KSTAT_DATA_ULONG);
4164 	kstat_named_init(&ethstat->tx_done, "tx_done",
4165 	    KSTAT_DATA_ULONG);
4166 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4167 	    KSTAT_DATA_ULONG);
4168 	kstat_named_init(&ethstat->rx_big, "rx_big",
4169 	    KSTAT_DATA_ULONG);
4170 	kstat_named_init(&ethstat->rx_small, "rx_small",
4171 	    KSTAT_DATA_ULONG);
4172 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4173 	ksp->ks_private = (void *) ss;
4174 	kstat_install(ksp);
4175 	return (DDI_SUCCESS);
4176 }
4177 
4178 
4179 
4180 #if #cpu(i386) || defined __i386 || defined i386 ||	\
4181 	defined __i386__ || #cpu(x86_64) || defined __x86_64__
4182 
4183 #include <vm/hat.h>
4184 #include <sys/ddi_isa.h>
4185 void *device_arena_alloc(size_t size, int vm_flag);
4186 void device_arena_free(void *vaddr, size_t size);
4187 
4188 static void
4189 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4190 {
4191 	dev_info_t *parent_dip;
4192 	ddi_acc_handle_t handle;
4193 	unsigned long bus_number, dev_number, func_number;
4194 	unsigned long cfg_pa, paddr, base, pgoffset;
4195 	char 		*cvaddr, *ptr;
4196 	uint32_t	*ptr32;
4197 	int 		retval = DDI_FAILURE;
4198 	int dontcare;
4199 	uint16_t read_vid, read_did, vendor_id, device_id;
4200 
4201 	if (!myri10ge_nvidia_ecrc_enable)
4202 		return;
4203 
4204 	parent_dip = ddi_get_parent(mgp->dip);
4205 	if (parent_dip == NULL) {
4206 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4207 		return;
4208 	}
4209 
4210 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4211 		cmn_err(CE_WARN,
4212 		    "%s: Could not access my parent's registers", mgp->name);
4213 		return;
4214 	}
4215 
4216 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4217 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4218 	pci_config_teardown(&handle);
4219 
4220 	if (myri10ge_verbose) {
4221 		unsigned long 	bus_number, dev_number, func_number;
4222 		int 		reg_set, span;
4223 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4224 		    &bus_number, &dev_number, &func_number);
4225 		if (myri10ge_verbose)
4226 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4227 			    bus_number, dev_number, func_number);
4228 	}
4229 
4230 	if (vendor_id !=  0x10de)
4231 		return;
4232 
4233 	if (device_id != 0x005d /* CK804 */ &&
4234 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4235 		return;
4236 	}
4237 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4238 	    &bus_number, &dev_number, &func_number);
4239 
4240 	for (cfg_pa = 0xf0000000UL;
4241 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4242 	    cfg_pa -= 0x10000000UL) {
4243 		/* find the config space address for the nvidia bridge */
4244 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4245 		    (dev_number * 8 + func_number) * 0x00001000UL);
4246 
4247 		base = paddr & (~MMU_PAGEOFFSET);
4248 		pgoffset = paddr & MMU_PAGEOFFSET;
4249 
4250 		/* map it into the kernel */
4251 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4252 		if (cvaddr == NULL)
4253 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4254 			    mgp->name);
4255 
4256 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4257 		    i_ddi_paddr_to_pfn(base),
4258 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4259 
4260 		ptr = cvaddr + pgoffset;
4261 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4262 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4263 		if (vendor_id ==  read_did || device_id == read_did) {
4264 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4265 			if (myri10ge_verbose)
4266 				printf("%s: Enabling ECRC on upstream "
4267 				    "Nvidia bridge (0x%x:0x%x) "
4268 				    "at %ld:%ld:%ld\n", mgp->name,
4269 				    read_vid, read_did, bus_number,
4270 				    dev_number, func_number);
4271 			*ptr32 |= 0x40;
4272 			retval = DDI_SUCCESS;
4273 		}
4274 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4275 		device_arena_free(cvaddr, ptob(1));
4276 	}
4277 }
4278 
4279 #else
/*
 * Stub used when not building for i386/x86_64: the Nvidia bridge ECRC
 * tweak is only compiled for those targets, so there is nothing to do.
 */
/*ARGSUSED*/
static void
myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
{
}
4285 #endif /* i386 */
4286 
4287 
4288 /*
4289  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4290  * when the PCI-E Completion packets are aligned on an 8-byte
4291  * boundary.  Some PCI-E chip sets always align Completion packets; on
4292  * the ones that do not, the alignment can be enforced by enabling
4293  * ECRC generation (if supported).
4294  *
4295  * When PCI-E Completion packets are not aligned, it is actually more
4296  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4297  *
4298  * If the driver can neither enable ECRC nor verify that it has
4299  * already been enabled, then it must use a firmware image which works
4300  * around unaligned completion packets (ethp_z8e.dat), and it should
4301  * also ensure that it never gives the device a Read-DMA which is
4302  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4303  * enabled, then the driver should use the aligned (eth_z8e.dat)
4304  * firmware image, and set tx.boundary to 4KB.
4305  */
4306 
4307 
4308 static int
4309 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4310 {
4311 	int status;
4312 
4313 	mgp->tx_boundary = 4096;
4314 	/*
4315 	 * Verify the max read request size was set to 4KB
4316 	 * before trying the test with 4KB.
4317 	 */
4318 	if (mgp->max_read_request_4k == 0)
4319 		mgp->tx_boundary = 2048;
4320 	/*
4321 	 * load the optimized firmware which assumes aligned PCIe
4322 	 * completions in order to see if it works on this host.
4323 	 */
4324 
4325 	mgp->fw_name = "rss_eth_z8e";
4326 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4327 	mgp->eth_z8e_length = rss_eth_z8e_length;
4328 
4329 	status = myri10ge_load_firmware(mgp);
4330 	if (status != 0) {
4331 		return (status);
4332 	}
4333 	/*
4334 	 * Enable ECRC if possible
4335 	 */
4336 	myri10ge_enable_nvidia_ecrc(mgp);
4337 
4338 	/*
4339 	 * Run a DMA test which watches for unaligned completions and
4340 	 * aborts on the first one seen.
4341 	 */
4342 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4343 	if (status == 0)
4344 		return (0); /* keep the aligned firmware */
4345 
4346 	if (status != E2BIG)
4347 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4348 		    mgp->name, status);
4349 	if (status == ENOSYS)
4350 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4351 		    "Please install up to date fw\n", mgp->name);
4352 	return (status);
4353 }
4354 
4355 static int
4356 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4357 {
4358 	int aligned;
4359 
4360 	aligned = 0;
4361 
4362 	if (myri10ge_force_firmware == 1) {
4363 		if (myri10ge_verbose)
4364 			printf("%s: Assuming aligned completions (forced)\n",
4365 			    mgp->name);
4366 		aligned = 1;
4367 		goto done;
4368 	}
4369 
4370 	if (myri10ge_force_firmware == 2) {
4371 		if (myri10ge_verbose)
4372 			printf("%s: Assuming unaligned completions (forced)\n",
4373 			    mgp->name);
4374 		aligned = 0;
4375 		goto done;
4376 	}
4377 
4378 	/* If the width is less than 8, we may used the aligned firmware */
4379 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4380 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4381 		    mgp->name, mgp->pcie_link_width);
4382 		aligned = 1;
4383 		goto done;
4384 	}
4385 
4386 	if (0 == myri10ge_firmware_probe(mgp))
4387 		return (0);  /* keep optimized firmware */
4388 
4389 done:
4390 	if (aligned) {
4391 		mgp->fw_name = "rss_eth_z8e";
4392 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4393 		mgp->eth_z8e_length = rss_eth_z8e_length;
4394 		mgp->tx_boundary = 4096;
4395 	} else {
4396 		mgp->fw_name = "rss_ethp_z8e";
4397 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4398 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4399 		mgp->tx_boundary = 2048;
4400 	}
4401 
4402 	return (myri10ge_load_firmware(mgp));
4403 }
4404 
4405 static int
4406 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4407 {
4408 	dev_info_t *devinfo = mgp->dip;
4409 	int count, avail, actual, intr_types;
4410 	int x, y, rc, inum = 0;
4411 
4412 
4413 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4414 	if (rc != DDI_SUCCESS) {
4415 		cmn_err(CE_WARN,
4416 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4417 		    rc);
4418 		return (DDI_FAILURE);
4419 	}
4420 
4421 	if (!myri10ge_use_msi)
4422 		intr_types &= ~DDI_INTR_TYPE_MSI;
4423 	if (!myri10ge_use_msix)
4424 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4425 
4426 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4427 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4428 		mgp->intr_type = "MSI-X";
4429 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4430 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4431 		mgp->intr_type = "MSI";
4432 	} else {
4433 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4434 		mgp->intr_type = "Legacy";
4435 	}
4436 	/* Get number of interrupts */
4437 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4438 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4439 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4440 		    "count: %d", mgp->name, rc, count);
4441 
4442 		return (DDI_FAILURE);
4443 	}
4444 
4445 	/* Get number of available interrupts */
4446 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4447 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4448 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4449 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4450 		return (DDI_FAILURE);
4451 	}
4452 	if (avail < count) {
4453 		cmn_err(CE_NOTE,
4454 		    "!%s: nintrs() returned %d, navail returned %d",
4455 		    mgp->name, count, avail);
4456 		count = avail;
4457 	}
4458 
4459 	if (count < mgp->num_slices)
4460 		return (DDI_FAILURE);
4461 
4462 	if (count > mgp->num_slices)
4463 		count = mgp->num_slices;
4464 
4465 	/* Allocate memory for MSI interrupts */
4466 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4467 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4468 
4469 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4470 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4471 
4472 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4473 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4474 		    mgp->name, rc);
4475 
4476 		kmem_free(mgp->htable, mgp->intr_size);
4477 		mgp->htable = NULL;
4478 		return (DDI_FAILURE);
4479 	}
4480 
4481 	if ((actual < count) && myri10ge_verbose) {
4482 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4483 		    mgp->name, actual, count);
4484 	}
4485 
4486 	mgp->intr_cnt = actual;
4487 
4488 	/*
4489 	 * Get priority for first irq, assume remaining are all the same
4490 	 */
4491 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4492 	    != DDI_SUCCESS) {
4493 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4494 
4495 		/* Free already allocated intr */
4496 		for (y = 0; y < actual; y++) {
4497 			(void) ddi_intr_free(mgp->htable[y]);
4498 		}
4499 
4500 		kmem_free(mgp->htable, mgp->intr_size);
4501 		mgp->htable = NULL;
4502 		return (DDI_FAILURE);
4503 	}
4504 
4505 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4506 
4507 	if (!add_handler)
4508 		return (DDI_SUCCESS);
4509 
4510 	/* Call ddi_intr_add_handler() */
4511 	for (x = 0; x < actual; x++) {
4512 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4513 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4514 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4515 			    mgp->name);
4516 
4517 			/* Free already allocated intr */
4518 			for (y = 0; y < actual; y++) {
4519 				(void) ddi_intr_free(mgp->htable[y]);
4520 			}
4521 
4522 			kmem_free(mgp->htable, mgp->intr_size);
4523 			mgp->htable = NULL;
4524 			return (DDI_FAILURE);
4525 		}
4526 	}
4527 
4528 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4529 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4530 		/* Call ddi_intr_block_enable() for MSI */
4531 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4532 	} else {
4533 		/* Call ddi_intr_enable() for MSI non block enable */
4534 		for (x = 0; x < mgp->intr_cnt; x++) {
4535 			(void) ddi_intr_enable(mgp->htable[x]);
4536 		}
4537 	}
4538 
4539 	return (DDI_SUCCESS);
4540 }
4541 
4542 static void
4543 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4544 {
4545 	int x, err;
4546 
4547 	/* Disable all interrupts */
4548 	if (handler_installed) {
4549 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4550 			/* Call ddi_intr_block_disable() */
4551 			(void) ddi_intr_block_disable(mgp->htable,
4552 			    mgp->intr_cnt);
4553 		} else {
4554 			for (x = 0; x < mgp->intr_cnt; x++) {
4555 				(void) ddi_intr_disable(mgp->htable[x]);
4556 			}
4557 		}
4558 	}
4559 
4560 	for (x = 0; x < mgp->intr_cnt; x++) {
4561 		if (handler_installed) {
4562 		/* Call ddi_intr_remove_handler() */
4563 			err = ddi_intr_remove_handler(mgp->htable[x]);
4564 			if (err != DDI_SUCCESS) {
4565 				cmn_err(CE_WARN,
4566 				    "%s: ddi_intr_remove_handler for"
4567 				    "vec %d returned %d\n", mgp->name,
4568 				    x, err);
4569 			}
4570 		}
4571 		err = ddi_intr_free(mgp->htable[x]);
4572 		if (err != DDI_SUCCESS) {
4573 			cmn_err(CE_WARN,
4574 			    "%s: ddi_intr_free for vec %d returned %d\n",
4575 			    mgp->name, x, err);
4576 		}
4577 	}
4578 	kmem_free(mgp->htable, mgp->intr_size);
4579 	mgp->htable = NULL;
4580 }
4581 
4582 static void
4583 myri10ge_test_physical(dev_info_t *dip)
4584 {
4585 	ddi_dma_handle_t	handle;
4586 	struct myri10ge_dma_stuff dma;
4587 	void *addr;
4588 	int err;
4589 
4590 	/* test #1, sufficient for older sparc systems */
4591 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4592 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4593 	    DDI_DMA_DONTWAIT, NULL, &handle);
4594 	if (err == DDI_DMA_BADATTR)
4595 		goto fail;
4596 	ddi_dma_free_handle(&handle);
4597 
4598 	/* test #2, required on Olympis where the bind is what fails */
4599 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4600 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4601 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4602 	if (addr == NULL)
4603 		goto fail;
4604 	myri10ge_dma_free(&dma);
4605 	return;
4606 
4607 fail:
4608 	if (myri10ge_verbose)
4609 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4610 		    "using IOMMU\n", ddi_get_instance(dip));
4611 
4612 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4613 }
4614 
4615 static void
4616 myri10ge_get_props(dev_info_t *dip)
4617 {
4618 
4619 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4620 	    "myri10ge_flow_control", myri10ge_flow_control);
4621 
4622 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4623 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4624 
4625 #if #cpu(i386) || defined __i386 || defined i386 ||	\
4626 	defined __i386__ || #cpu(x86_64) || defined __x86_64__
4627 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4628 	    "myri10ge_nvidia_ecrc_enable", 1);
4629 #endif
4630 
4631 
4632 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4633 	    "myri10ge_use_msi", myri10ge_use_msi);
4634 
4635 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4636 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4637 
4638 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4639 	    "myri10ge_verbose", myri10ge_verbose);
4640 
4641 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4642 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4643 
4644 	if (myri10ge_tx_copylen < 60) {
4645 		cmn_err(CE_WARN,
4646 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4647 		myri10ge_tx_copylen = 60;
4648 	}
4649 
4650 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4651 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4652 
4653 	if (myri10ge_mtu_override >= 1500 && myri10ge_mtu_override <= 9000)
4654 		myri10ge_mtu = myri10ge_mtu_override +
4655 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4656 	else if (myri10ge_mtu_override != 0) {
4657 		cmn_err(CE_WARN,
4658 		    "myri10ge_mtu_override must be between 1500 and "
4659 		    "9000 bytes\n");
4660 	}
4661 
4662 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4663 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4664 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4665 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4666 
4667 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4668 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4669 
4670 	if (myri10ge_bigbufs_initial < 128) {
4671 		cmn_err(CE_WARN,
4672 		    "myri10ge_bigbufs_initial be at least 128\n");
4673 		myri10ge_bigbufs_initial = 128;
4674 	}
4675 	if (myri10ge_bigbufs_max < 128) {
4676 		cmn_err(CE_WARN,
4677 		    "myri10ge_bigbufs_max be at least 128\n");
4678 		myri10ge_bigbufs_max = 128;
4679 	}
4680 
4681 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4682 		cmn_err(CE_WARN,
4683 		    "myri10ge_bigbufs_max must be >=  "
4684 		    "myri10ge_bigbufs_initial\n");
4685 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4686 	}
4687 
4688 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4689 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4690 
4691 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4692 	    "myri10ge_max_slices", myri10ge_max_slices);
4693 
4694 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4695 	    "myri10ge_use_msix", myri10ge_use_msix);
4696 
4697 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4698 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4699 
4700 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4701 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4702 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4703 		    myri10ge_rss_hash);
4704 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4705 	}
4706 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4707 	    "myri10ge_lro", myri10ge_lro);
4708 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4709 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4710 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4711 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4712 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4713 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4714 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4715 	    "myri10ge_use_lso", myri10ge_use_lso);
4716 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4717 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4718 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4719 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4720 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4721 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4722 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4723 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4724 		    myri10ge_small_bytes);
4725 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4726 		myri10ge_small_bytes += 128;
4727 		myri10ge_small_bytes &= ~(128 -1);
4728 		myri10ge_small_bytes -= MXGEFW_PAD;
4729 		cmn_err(CE_WARN, "rounded up to %d\n",
4730 		    myri10ge_small_bytes);
4731 
4732 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4733 	}
4734 }
4735 
4736 #ifndef	PCI_EXP_LNKSTA
4737 #define	PCI_EXP_LNKSTA 18
4738 #endif
4739 
4740 static int
4741 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4742 {
4743 	uint16_t	status;
4744 	uint8_t 	ptr;
4745 
4746 	/* check to see if we have capabilities */
4747 	status = pci_config_get16(handle, PCI_CONF_STAT);
4748 	if (!(status & PCI_STAT_CAP)) {
4749 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4750 		return (ENXIO);
4751 	}
4752 
4753 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4754 
4755 	/* Walk the capabilities list, looking for a PCI Express cap */
4756 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4757 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4758 			break;
4759 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4760 	}
4761 	if (ptr < 64) {
4762 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4763 		return (ENXIO);
4764 	}
4765 	*capptr = ptr;
4766 	return (0);
4767 }
4768 
4769 static int
4770 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4771 {
4772 	int err;
4773 	uint16_t	val;
4774 	uint8_t		ptr;
4775 
4776 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4777 	if (err != 0) {
4778 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4779 		return (ENXIO);
4780 	}
4781 
4782 	/* set max read req to 4096 */
4783 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4784 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4785 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4786 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4787 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4788 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4789 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4790 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4791 		return (EINVAL);
4792 	}
4793 	return (0);
4794 }
4795 
4796 static int
4797 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4798 {
4799 	int err;
4800 	uint16_t	val;
4801 	uint8_t		ptr;
4802 
4803 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4804 	if (err != 0) {
4805 		cmn_err(CE_WARN, "could not set max read req\n");
4806 		return (ENXIO);
4807 	}
4808 
4809 	/* read link width */
4810 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4811 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4812 	*link = (val >> 4);
4813 	return (0);
4814 }
4815 
4816 static int
4817 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4818 {
4819 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4820 	uint32_t reboot;
4821 	uint16_t cmd;
4822 	int err;
4823 
4824 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4825 	if ((cmd & PCI_COMM_ME) == 0) {
4826 		/*
4827 		 * Bus master DMA disabled?  Check to see if the card
4828 		 * rebooted due to a parity error For now, just report
4829 		 * it
4830 		 */
4831 
4832 		/* enter read32 mode */
4833 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4834 		/* read REBOOT_STATUS (0xfffffff0) */
4835 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4836 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4837 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4838 		return (0);
4839 	}
4840 	if (!myri10ge_watchdog_reset) {
4841 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4842 		return (1);
4843 	}
4844 
4845 	myri10ge_stop_locked(mgp);
4846 	err = myri10ge_start_locked(mgp);
4847 	if (err == DDI_FAILURE) {
4848 		return (0);
4849 	}
4850 	mac_tx_update(mgp->mh);
4851 	return (1);
4852 }
4853 
4854 static inline int
4855 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4856 {
4857 	if (tx->sched != tx->stall &&
4858 	    tx->done == tx->watchdog_done &&
4859 	    tx->watchdog_req != tx->watchdog_done)
4860 		return (1);
4861 	return (0);
4862 }
4863 
4864 static void
4865 myri10ge_watchdog(void *arg)
4866 {
4867 	struct myri10ge_priv *mgp;
4868 	struct myri10ge_slice_state *ss;
4869 	myri10ge_tx_ring_t *tx;
4870 	int nic_ok = 1;
4871 	int slices_stalled, rx_pause, i;
4872 	int add_rx;
4873 
4874 	mgp = arg;
4875 	mutex_enter(&mgp->intrlock);
4876 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4877 		cmn_err(CE_WARN,
4878 		    "%s not running, not rearming watchdog (%d)\n",
4879 		    mgp->name, mgp->running);
4880 		mutex_exit(&mgp->intrlock);
4881 		return;
4882 	}
4883 
4884 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4885 
4886 	/*
4887 	 * make sure nic is stalled before we reset the nic, so as to
4888 	 * ensure we don't rip the transmit data structures out from
4889 	 * under a pending transmit
4890 	 */
4891 
4892 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4893 		tx = &mgp->ss[i].tx;
4894 		slices_stalled = myri10ge_ring_stalled(tx);
4895 		if (slices_stalled)
4896 			break;
4897 	}
4898 
4899 	if (slices_stalled) {
4900 		if (mgp->watchdog_rx_pause == rx_pause) {
4901 			cmn_err(CE_WARN,
4902 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4903 			    mgp->name, i, tx->sched, tx->stall,
4904 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4905 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4906 			nic_ok = myri10ge_reset_nic(mgp);
4907 		} else {
4908 			cmn_err(CE_WARN,
4909 			    "%s Flow controlled, check link partner\n",
4910 			    mgp->name);
4911 		}
4912 	}
4913 
4914 	if (!nic_ok) {
4915 		cmn_err(CE_WARN,
4916 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4917 		mutex_exit(&mgp->intrlock);
4918 		return;
4919 	}
4920 	for (i = 0; i < mgp->num_slices; i++) {
4921 		ss = &mgp->ss[i];
4922 		tx = &ss->tx;
4923 		tx->watchdog_done = tx->done;
4924 		tx->watchdog_req = tx->req;
4925 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4926 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4927 			add_rx =
4928 			    min(ss->jpool.num_alloc,
4929 			    myri10ge_bigbufs_max -
4930 			    (ss->jpool.num_alloc -
4931 			    ss->jbufs_for_smalls));
4932 			if (add_rx != 0) {
4933 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4934 				/* now feed them to the firmware */
4935 				mutex_enter(&ss->jpool.mtx);
4936 				myri10ge_restock_jumbos(ss);
4937 				mutex_exit(&ss->jpool.mtx);
4938 			}
4939 		}
4940 	}
4941 	mgp->watchdog_rx_pause = rx_pause;
4942 
4943 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
4944 	    mgp->timer_ticks);
4945 	mutex_exit(&mgp->intrlock);
4946 }
4947 
4948 /*ARGSUSED*/
4949 static int
4950 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4951 
4952 {
4953 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4954 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
4955 	return (0);
4956 }
4957 
4958 /*ARGSUSED*/
4959 static int
4960 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
4961     caddr_t cp, cred_t *credp)
4962 
4963 {
4964 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4965 	char *end;
4966 	size_t new_value;
4967 
4968 	new_value = mi_strtol(value, &end, 10);
4969 	if (end == value)
4970 		return (EINVAL);
4971 
4972 	mutex_enter(&myri10ge_param_lock);
4973 	mgp->intr_coal_delay = (int)new_value;
4974 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
4975 	mutex_exit(&myri10ge_param_lock);
4976 	return (0);
4977 }
4978 
4979 /*ARGSUSED*/
4980 static int
4981 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4982 
4983 {
4984 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4985 	(void) mi_mpprintf(mp, "%d", mgp->pause);
4986 	return (0);
4987 }
4988 
4989 /*ARGSUSED*/
4990 static int
4991 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
4992 			caddr_t cp, cred_t *credp)
4993 
4994 {
4995 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4996 	char *end;
4997 	size_t new_value;
4998 	int err = 0;
4999 
5000 	new_value = mi_strtol(value, &end, 10);
5001 	if (end == value)
5002 		return (EINVAL);
5003 	if (new_value != 0)
5004 		new_value = 1;
5005 
5006 	mutex_enter(&myri10ge_param_lock);
5007 	if (new_value != mgp->pause)
5008 		err = myri10ge_change_pause(mgp, new_value);
5009 	mutex_exit(&myri10ge_param_lock);
5010 	return (err);
5011 }
5012 
5013 /*ARGSUSED*/
5014 static int
5015 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5016 
5017 {
5018 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5019 	return (0);
5020 }
5021 
5022 /*ARGSUSED*/
5023 static int
5024 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5025     caddr_t cp, cred_t *credp)
5026 
5027 {
5028 	char *end;
5029 	size_t new_value;
5030 
5031 	new_value = mi_strtol(value, &end, 10);
5032 	if (end == value)
5033 		return (EINVAL);
5034 	*(int *)(void *)cp = new_value;
5035 
5036 	return (0);
5037 }
5038 
5039 static void
5040 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5041 {
5042 	mgp->nd_head = NULL;
5043 
5044 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5045 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5046 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5047 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5048 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5049 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5050 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5051 	    myri10ge_get_int, myri10ge_set_int,
5052 	    (caddr_t)&myri10ge_deassert_wait);
5053 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5054 	    myri10ge_get_int, myri10ge_set_int,
5055 	    (caddr_t)&myri10ge_bigbufs_max);
5056 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5057 	    myri10ge_get_int, myri10ge_set_int,
5058 	    (caddr_t)&myri10ge_lro);
5059 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5060 	    myri10ge_get_int, myri10ge_set_int,
5061 	    (caddr_t)&myri10ge_lro_max_aggr);
5062 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5063 	    myri10ge_get_int, myri10ge_set_int,
5064 	    (caddr_t)&myri10ge_tx_hash);
5065 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5066 	    myri10ge_get_int, myri10ge_set_int,
5067 	    (caddr_t)&myri10ge_lso_copy);
5068 }
5069 
/*
 * Release the instance's ndd parameter list (mgp->nd_head) by
 * handing it to nd_free().
 */
static void
myri10ge_ndd_fini(struct myri10ge_priv *mgp)
{
	nd_free(&mgp->nd_head);
}
5075 
5076 static void
5077 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5078 {
5079 	struct iocblk *iocp;
5080 	struct myri10ge_priv *mgp = arg;
5081 	int cmd, ok, err;
5082 
5083 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5084 	cmd = iocp->ioc_cmd;
5085 
5086 	ok = 0;
5087 	err = 0;
5088 
5089 	switch (cmd) {
5090 	case ND_GET:
5091 	case ND_SET:
5092 		ok = nd_getset(wq, mgp->nd_head, mp);
5093 		break;
5094 	default:
5095 		break;
5096 	}
5097 	if (!ok)
5098 		err = EINVAL;
5099 	else
5100 		err = iocp->ioc_error;
5101 
5102 	if (!err)
5103 		miocack(wq, mp, iocp->ioc_count, err);
5104 	else
5105 		miocnak(wq, mp, 0, err);
5106 }
5107 
5108 static struct myri10ge_priv *mgp_list;
5109 
5110 struct myri10ge_priv *
5111 myri10ge_get_instance(uint_t unit)
5112 {
5113 	struct myri10ge_priv *mgp;
5114 
5115 	mutex_enter(&myri10ge_param_lock);
5116 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5117 		if (unit == ddi_get_instance(mgp->dip)) {
5118 			mgp->refcnt++;
5119 			break;
5120 		}
5121 	}
5122 	mutex_exit(&myri10ge_param_lock);
5123 	return (mgp);
5124 }
5125 
5126 void
5127 myri10ge_put_instance(struct myri10ge_priv *mgp)
5128 {
5129 	mutex_enter(&myri10ge_param_lock);
5130 	mgp->refcnt--;
5131 	mutex_exit(&myri10ge_param_lock);
5132 }
5133 
5134 static boolean_t
5135 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5136 {
5137 	struct myri10ge_priv *mgp = arg;
5138 	uint32_t *cap_hcksum;
5139 	mac_capab_lso_t *cap_lso;
5140 	mac_capab_rings_t *cap_rings;
5141 
5142 	switch (cap) {
5143 	case MAC_CAPAB_HCKSUM:
5144 		cap_hcksum = cap_data;
5145 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5146 		break;
5147 	case MAC_CAPAB_RINGS:
5148 		cap_rings = cap_data;
5149 		switch (cap_rings->mr_type) {
5150 		case MAC_RING_TYPE_RX:
5151 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5152 			cap_rings->mr_rnum = mgp->num_slices;
5153 			cap_rings->mr_gnum = 1;
5154 			cap_rings->mr_rget = myri10ge_fill_ring;
5155 			cap_rings->mr_gget = myri10ge_fill_group;
5156 			break;
5157 		case MAC_RING_TYPE_TX:
5158 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5159 			cap_rings->mr_rnum = mgp->num_slices;
5160 			cap_rings->mr_gnum = 0;
5161 			cap_rings->mr_rget = myri10ge_fill_ring;
5162 			cap_rings->mr_gget = NULL;
5163 			break;
5164 		default:
5165 			return (B_FALSE);
5166 		}
5167 		break;
5168 	case MAC_CAPAB_LSO:
5169 		cap_lso = cap_data;
5170 		if (!myri10ge_use_lso)
5171 			return (B_FALSE);
5172 		if (!(mgp->features & MYRI10GE_TSO))
5173 			return (B_FALSE);
5174 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5175 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5176 		break;
5177 
5178 	default:
5179 		return (B_FALSE);
5180 	}
5181 	return (B_TRUE);
5182 }
5183 
5184 
/*
 * GLDv3 statistics entry point.
 *
 * Stores the requested statistic in *val and returns 0, or returns
 * ENOTSUP for a statistic this driver does not report.  Per-ring
 * software counters are summed over all mgp->num_slices slices;
 * firmware drop counters are read from slice 0's fw_stats block and
 * converted with ntohl().
 */
static int
myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	struct myri10ge_priv *mgp = arg;
	struct myri10ge_rx_ring_stats *rstat;
	struct myri10ge_tx_ring_stats *tstat;
	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
	/*
	 * NOTE(review): MYRI10GE_SLICE_STAT() is given only a field name,
	 * so it presumably picks up the local 'ss' -- confirm against the
	 * macro definition before renaming this variable.
	 */
	struct myri10ge_slice_state *ss;
	uint64_t tmp = 0;
	int i;

	switch (stat) {
	case MAC_STAT_IFSPEED:
		/* fixed value: 10 * 1000 * 1000000 */
		*val = 10ull * 1000ull * 1000000ull;
		break;

	case MAC_STAT_MULTIRCV:
		for (i = 0; i < mgp->num_slices; i++) {
			rstat = &mgp->ss[i].rx_stats;
			tmp += rstat->multircv;
		}
		*val = tmp;
		break;

	case MAC_STAT_BRDCSTRCV:
		for (i = 0; i < mgp->num_slices; i++) {
			rstat = &mgp->ss[i].rx_stats;
			tmp += rstat->brdcstrcv;
		}
		*val = tmp;
		break;

	case MAC_STAT_MULTIXMT:
		for (i = 0; i < mgp->num_slices; i++) {
			tstat = &mgp->ss[i].tx.stats;
			tmp += tstat->multixmt;
		}
		*val = tmp;
		break;

	case MAC_STAT_BRDCSTXMT:
		for (i = 0; i < mgp->num_slices; i++) {
			tstat = &mgp->ss[i].tx.stats;
			tmp += tstat->brdcstxmt;
		}
		*val = tmp;
		break;

	case MAC_STAT_NORCVBUF:
		/* firmware drop counters plus per-slice "nobuf" counters */
		tmp = ntohl(fw_stats->dropped_no_big_buffer);
		tmp += ntohl(fw_stats->dropped_no_small_buffer);
		tmp += ntohl(fw_stats->dropped_link_overflow);
		for (i = 0; i < mgp->num_slices; i++) {
			ss = &mgp->ss[i];
			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
		}
		*val = tmp;
		break;

	case MAC_STAT_IERRORS:
		/* sum of the firmware's receive-side drop counters */
		tmp += ntohl(fw_stats->dropped_bad_crc32);
		tmp += ntohl(fw_stats->dropped_bad_phy);
		tmp += ntohl(fw_stats->dropped_runt);
		tmp += ntohl(fw_stats->dropped_overrun);
		*val = tmp;
		break;

	case MAC_STAT_OERRORS:
		for (i = 0; i < mgp->num_slices; i++) {
			ss = &mgp->ss[i];
			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
			tmp += MYRI10GE_SLICE_STAT(xmit_err);
		}
		*val = tmp;
		break;

	case MAC_STAT_RBYTES:
		for (i = 0; i < mgp->num_slices; i++) {
			rstat = &mgp->ss[i].rx_stats;
			tmp += rstat->ibytes;
		}
		*val = tmp;
		break;

	case MAC_STAT_IPACKETS:
		for (i = 0; i < mgp->num_slices; i++) {
			rstat = &mgp->ss[i].rx_stats;
			tmp += rstat->ipackets;
		}
		*val = tmp;
		break;

	case MAC_STAT_OBYTES:
		for (i = 0; i < mgp->num_slices; i++) {
			tstat = &mgp->ss[i].tx.stats;
			tmp += tstat->obytes;
		}
		*val = tmp;
		break;

	case MAC_STAT_OPACKETS:
		for (i = 0; i < mgp->num_slices; i++) {
			tstat = &mgp->ss[i].tx.stats;
			tmp += tstat->opackets;
		}
		*val = tmp;
		break;

	case ETHER_STAT_TOOLONG_ERRORS:
		*val = ntohl(fw_stats->dropped_overrun);
		break;

#ifdef SOLARIS_S11
	case ETHER_STAT_TOOSHORT_ERRORS:
		*val = ntohl(fw_stats->dropped_runt);
		break;
#endif

	case ETHER_STAT_LINK_PAUSE:
		*val = mgp->pause;
		break;

	case ETHER_STAT_LINK_AUTONEG:
		/* always reported as 1 */
		*val = 1;
		break;

	case ETHER_STAT_LINK_DUPLEX:
		/* always reported as full duplex */
		*val = LINK_DUPLEX_FULL;
		break;

	default:
		return (ENOTSUP);
	}

	return (0);
}
5322 
/*
 * GLDv3 callback vector handed to mac_register() through
 * macp->m_callbacks in myri10ge_attach().  The initializer is
 * positional, so its order must match the mac_callbacks_t layout.
 * The first member flags the optional ioctl and getcapab callbacks as
 * present.
 *
 * NOTE(review): the two NULL slots are presumably the unicast-address
 * and transmit callbacks (transmit being done through the rings
 * capability instead) -- confirm against the mac_callbacks_t
 * definition in use.
 */
static mac_callbacks_t myri10ge_m_callbacks = {
	(MC_IOCTL | MC_GETCAPAB),
	myri10ge_m_stat,
	myri10ge_m_start,
	myri10ge_m_stop,
	myri10ge_m_promisc,
	myri10ge_m_multicst,
	NULL,
	NULL,
	myri10ge_m_ioctl,
	myri10ge_m_getcapab
};
5335 
5336 
/*
 * Decide how many slices this instance will use and store the result
 * in mgp->num_slices.
 *
 * The NIC is reset first; a failed reset, or a failed
 * MXGEFW_CMD_SET_INTRQ_SIZE, returns ENXIO.  Every other path returns 0
 * with num_slices set: 1 when MSI-X use is disabled or the firmware
 * does not answer MXGEFW_CMD_GET_MAX_RSS_QUEUES, otherwise the
 * firmware's maximum clamped to myri10ge_max_slices, rounded down to a
 * power of two, and further reduced until that many interrupt vectors
 * can actually be allocated.
 */
static int
myri10ge_probe_slices(struct myri10ge_priv *mgp)
{
	myri10ge_cmd_t cmd;
	int status;

	mgp->num_slices = 1;

	/* hit the board with a reset to ensure it is alive */
	(void) memset(&cmd, 0, sizeof (cmd));
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
		return (ENXIO);
	}

	/* without MSI-X we stay with the single slice set above */
	if (myri10ge_use_msix == 0)
		return (0);

	/* tell it the size of the interrupt queues */
	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
		    mgp->name);
		return (ENXIO);
	}

	/* ask the maximum number of slices it supports */
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
	    &cmd);
	/* not an error: fall back to the single slice set above */
	if (status != 0)
		return (0);

	mgp->num_slices = cmd.data0;

	/*
	 * if the admin did not specify a limit to how many
	 * slices we should use, cap it automatically to the
	 * number of CPUs currently online
	 */
	if (myri10ge_max_slices == -1)
		myri10ge_max_slices = ncpus;

	if (mgp->num_slices > myri10ge_max_slices)
		mgp->num_slices = myri10ge_max_slices;


	/*
	 * Now try to allocate as many MSI-X vectors as we have
	 * slices. We give up on MSI-X if we can only get a single
	 * vector.
	 */
	while (mgp->num_slices > 1) {
		/* make sure it is a power of two */
		while (mgp->num_slices & (mgp->num_slices - 1))
			mgp->num_slices--;
		if (mgp->num_slices == 1)
			return (0);

		/*
		 * Trial allocation only: the vectors are released again
		 * right away and mgp->intr_cnt is compared with the
		 * number of slices we asked for.
		 */
		status = myri10ge_add_intrs(mgp, 0);
		if (status == 0) {
			myri10ge_rem_intrs(mgp, 0);
			if (mgp->intr_cnt == mgp->num_slices) {
				if (myri10ge_verbose)
					printf("Got %d slices!\n",
					    mgp->num_slices);
				return (0);
			}
			/* got fewer vectors: retry with that count */
			mgp->num_slices = mgp->intr_cnt;
		} else {
			/* allocation failed outright: halve and retry */
			mgp->num_slices = mgp->num_slices / 2;
		}
	}

	if (myri10ge_verbose)
		printf("Got %d slices\n", mgp->num_slices);
	return (0);
}
5416 
5417 static void
5418 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5419 {
5420 	struct lro_entry *lro;
5421 
5422 	while (ss->lro_free != NULL) {
5423 		lro = ss->lro_free;
5424 		ss->lro_free = lro->next;
5425 		kmem_free(lro, sizeof (*lro));
5426 	}
5427 }
5428 
5429 static void
5430 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5431 {
5432 	struct lro_entry *lro;
5433 	int idx;
5434 
5435 	ss->lro_free = NULL;
5436 	ss->lro_active = NULL;
5437 
5438 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5439 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5440 		if (lro == NULL)
5441 			continue;
5442 		lro->next = ss->lro_free;
5443 		ss->lro_free = lro;
5444 	}
5445 }
5446 
5447 static void
5448 myri10ge_free_slices(struct myri10ge_priv *mgp)
5449 {
5450 	struct myri10ge_slice_state *ss;
5451 	size_t bytes;
5452 	int i;
5453 
5454 	if (mgp->ss == NULL)
5455 		return;
5456 
5457 	for (i = 0; i < mgp->num_slices; i++) {
5458 		ss = &mgp->ss[i];
5459 		if (ss->rx_done.entry == NULL)
5460 			continue;
5461 		myri10ge_dma_free(&ss->rx_done.dma);
5462 		ss->rx_done.entry = NULL;
5463 		if (ss->fw_stats == NULL)
5464 			continue;
5465 		myri10ge_dma_free(&ss->fw_stats_dma);
5466 		ss->fw_stats = NULL;
5467 		mutex_destroy(&ss->rx_lock);
5468 		mutex_destroy(&ss->tx.lock);
5469 		mutex_destroy(&ss->tx.handle_lock);
5470 		mutex_destroy(&ss->poll_lock);
5471 		myri10ge_jpool_fini(ss);
5472 		myri10ge_slice_stat_destroy(ss);
5473 		myri10ge_lro_free(ss);
5474 	}
5475 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5476 	kmem_free(mgp->ss, bytes);
5477 	mgp->ss = NULL;
5478 }
5479 
5480 
/*
 * Allocate and initialize the per-slice state array, mgp->ss, with
 * mgp->num_slices entries.
 *
 * For every slice, in this order: a DMA-able firmware stats block, a
 * DMA-able rx done ring of mgp->max_intr_slots entries, the slice
 * locks, the jumbo buffer pool, the slice kstats and the LRO free
 * list.
 *
 * Returns 0 on success.  If a DMA allocation fails, everything done so
 * far is undone with myri10ge_free_slices() and ENOMEM is returned.
 */
static int
myri10ge_alloc_slices(struct myri10ge_priv *mgp)
{
	struct myri10ge_slice_state *ss;
	size_t bytes;
	int i;

	bytes = sizeof (*mgp->ss) * mgp->num_slices;
	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
	if (mgp->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < mgp->num_slices; i++) {
		ss = &mgp->ss[i];

		/* back pointer from the slice to its instance */
		ss->mgp = mgp;

		/* allocate the per-slice firmware stats */
		bytes = sizeof (*ss->fw_stats);
		ss->fw_stats = (mcp_irq_data_t *)(void *)
		    myri10ge_dma_alloc(mgp->dip, bytes,
		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
		if (ss->fw_stats == NULL)
			goto abort;
		(void) memset(ss->fw_stats, 0, bytes);

		/* allocate rx done ring */
		bytes = mgp->max_intr_slots *
		    sizeof (*ss->rx_done.entry);
		ss->rx_done.entry = (mcp_slot_t *)(void *)
		    myri10ge_dma_alloc(mgp->dip, bytes,
		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
		if (ss->rx_done.entry == NULL) {
			goto abort;
		}
		(void) memset(ss->rx_done.entry, 0, bytes);
		/*
		 * Nothing below can fail, so a non-NULL rx_done.entry
		 * implies the locks, pool, kstats and LRO list exist.
		 */
		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
		myri10ge_jpool_init(ss);
		/* the kstat setup result is deliberately ignored */
		(void) myri10ge_slice_stat_init(ss);
		myri10ge_lro_alloc(ss);
	}

	return (0);

abort:
	myri10ge_free_slices(mgp);
	return (ENOMEM);
}
5535 
5536 static int
5537 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5538     ddi_acc_handle_t handle)
5539 {
5540 	uint8_t ptr;
5541 	int err;
5542 
5543 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5544 	if (err != 0) {
5545 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5546 		    mgp->name);
5547 		return (DDI_FAILURE);
5548 	}
5549 	mgp->pci_saved_state.msi_ctrl =
5550 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5551 	mgp->pci_saved_state.msi_addr_low =
5552 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5553 	mgp->pci_saved_state.msi_addr_high =
5554 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5555 	mgp->pci_saved_state.msi_data_32 =
5556 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5557 	mgp->pci_saved_state.msi_data_64 =
5558 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5559 	return (DDI_SUCCESS);
5560 }
5561 
5562 static int
5563 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5564     ddi_acc_handle_t handle)
5565 {
5566 	uint8_t ptr;
5567 	int err;
5568 
5569 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5570 	if (err != 0) {
5571 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5572 		    mgp->name);
5573 		return (DDI_FAILURE);
5574 	}
5575 
5576 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5577 	    mgp->pci_saved_state.msi_ctrl);
5578 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5579 	    mgp->pci_saved_state.msi_addr_low);
5580 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5581 	    mgp->pci_saved_state.msi_addr_high);
5582 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5583 	    mgp->pci_saved_state.msi_data_32);
5584 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5585 	    mgp->pci_saved_state.msi_data_64);
5586 
5587 	return (DDI_SUCCESS);
5588 }
5589 
5590 static int
5591 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5592 {
5593 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5594 	int i;
5595 	int err = DDI_SUCCESS;
5596 
5597 
5598 	/* Save the non-extended PCI config space 32-bits at a time */
5599 	for (i = 0; i < 16; i++)
5600 		mgp->pci_saved_state.base[i] =
5601 		    pci_config_get32(handle, i*4);
5602 
5603 	/* now save MSI interrupt state *, if needed */
5604 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5605 		err = myri10ge_save_msi_state(mgp, handle);
5606 
5607 	return (err);
5608 }
5609 
5610 static int
5611 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5612 {
5613 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5614 	int i;
5615 	int err = DDI_SUCCESS;
5616 
5617 
5618 	/* Restore the non-extended PCI config space 32-bits at a time */
5619 	for (i = 15; i >= 0; i--)
5620 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5621 
5622 	/* now restore MSI interrupt state *, if needed */
5623 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5624 		err = myri10ge_restore_msi_state(mgp, handle);
5625 
5626 	if (mgp->max_read_request_4k)
5627 		(void) myri10ge_set_max_readreq(handle);
5628 	return (err);
5629 }
5630 
5631 
/*
 * DDI_SUSPEND handler, called from myri10ge_detach().
 *
 * If the interface is running, the watchdog timeout is cancelled, the
 * interface is stopped and the state becomes
 * MYRI10GE_ETH_SUSPENDED_RUNNING so that myri10ge_resume() knows to
 * restart it.  In every case the PCI config state is then saved.
 *
 * Returns DDI_FAILURE if the driver-private pointer is missing or does
 * not belong to 'dip', otherwise the result of
 * myri10ge_save_pci_state().
 */
static int
myri10ge_suspend(dev_info_t *dip)
{
	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
	int status;

	if (mgp == NULL) {
		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	if (mgp->dip != dip) {
		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	mutex_enter(&mgp->intrlock);
	if (mgp->running == MYRI10GE_ETH_RUNNING) {
		mgp->running = MYRI10GE_ETH_STOPPING;
		/*
		 * intrlock is dropped around untimeout() and re-taken
		 * before stopping the interface.
		 * NOTE(review): presumably because the watchdog callback
		 * takes intrlock itself -- confirm against
		 * myri10ge_watchdog().
		 */
		mutex_exit(&mgp->intrlock);
		(void) untimeout(mgp->timer_id);
		mutex_enter(&mgp->intrlock);
		myri10ge_stop_locked(mgp);
		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
	}
	status = myri10ge_save_pci_state(mgp);
	mutex_exit(&mgp->intrlock);
	return (status);
}
5659 
/*
 * DDI_RESUME handler, called from myri10ge_attach().
 *
 * Restores the saved PCI config state and, if the interface was
 * running when it was suspended (MYRI10GE_ETH_SUSPENDED_RUNNING),
 * starts it again.  On success the watchdog timeout is re-armed.
 *
 * Returns DDI_FAILURE if the driver-private pointer is missing or does
 * not belong to 'dip', the failing status if the restore or restart
 * fails, and DDI_SUCCESS otherwise.
 */
static int
myri10ge_resume(dev_info_t *dip)
{
	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
	int status = DDI_SUCCESS;

	if (mgp == NULL) {
		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
		return (DDI_FAILURE);
	}
	if (mgp->dip != dip) {
		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
		return (DDI_FAILURE);
	}

	mutex_enter(&mgp->intrlock);
	status = myri10ge_restore_pci_state(mgp);
	if (status == DDI_SUCCESS &&
	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
		status = myri10ge_start_locked(mgp);
	}
	mutex_exit(&mgp->intrlock);
	if (status != DDI_SUCCESS)
		return (status);

	/*
	 * start the watchdog timer
	 *
	 * NOTE(review): this happens on every successful resume, whereas
	 * myri10ge_suspend() only cancels the watchdog when the
	 * interface was running -- confirm that arming it for an
	 * interface that was not restarted is intended.
	 */
	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
	    mgp->timer_ticks);
	return (DDI_SUCCESS);
}
5690 
5691 static int
5692 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5693 {
5694 
5695 	struct myri10ge_priv *mgp;
5696 	mac_register_t *macp, *omacp;
5697 	ddi_acc_handle_t handle;
5698 	uint32_t csr, hdr_offset;
5699 	int status, span, link_width, max_read_request_4k;
5700 	unsigned long bus_number, dev_number, func_number;
5701 	size_t bytes;
5702 	offset_t ss_offset;
5703 	uint8_t vso;
5704 
5705 	if (cmd == DDI_RESUME) {
5706 		return (myri10ge_resume(dip));
5707 	}
5708 
5709 	if (cmd != DDI_ATTACH)
5710 		return (DDI_FAILURE);
5711 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5712 		return (DDI_FAILURE);
5713 
5714 	/* enable busmater and io space access */
5715 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5716 	pci_config_put32(handle, PCI_CONF_COMM,
5717 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5718 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5719 	if (status != 0) {
5720 		cmn_err(CE_WARN, "could not read link width!\n");
5721 		link_width = 0;
5722 	}
5723 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5724 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5725 	if (status != 0)
5726 		goto abort_with_cfg_hdl;
5727 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5728 		goto abort_with_cfg_hdl;
5729 	/*
5730 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5731 	 * able to write newer fields, such as m_margin, without
5732 	 * writing outside allocated memory, we allocate our own macp
5733 	 * and pass that to mac_register()
5734 	 */
5735 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5736 	macp->m_version = omacp->m_version;
5737 
5738 	if ((mgp = (struct myri10ge_priv *)
5739 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5740 		goto abort_with_macinfo;
5741 	}
5742 	ddi_set_driver_private(dip, mgp);
5743 
5744 	/* setup device name for log messages */
5745 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5746 
5747 	mutex_enter(&myri10ge_param_lock);
5748 	myri10ge_get_props(dip);
5749 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5750 	mgp->pause = myri10ge_flow_control;
5751 	mutex_exit(&myri10ge_param_lock);
5752 
5753 	mgp->max_read_request_4k = max_read_request_4k;
5754 	mgp->pcie_link_width = link_width;
5755 	mgp->running = MYRI10GE_ETH_STOPPED;
5756 	mgp->vso = vso;
5757 	mgp->dip = dip;
5758 	mgp->cfg_hdl = handle;
5759 
5760 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5761 	myri10ge_test_physical(dip);
5762 
5763 	/* allocate command page */
5764 	bytes = sizeof (*mgp->cmd);
5765 	mgp->cmd = (mcp_cmd_response_t *)
5766 	    (void *)myri10ge_dma_alloc(dip, bytes,
5767 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5768 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5769 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5770 	if (mgp->cmd == NULL)
5771 		goto abort_with_mgp;
5772 
5773 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5774 	    &dev_number, &func_number);
5775 	if (myri10ge_verbose)
5776 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5777 		    bus_number, dev_number, func_number);
5778 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5779 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5780 	    &mgp->io_handle);
5781 	if (status != DDI_SUCCESS) {
5782 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5783 		printf("%s: reg_set = %d, span = %d, status = %d",
5784 		    mgp->name, mgp->reg_set, span, status);
5785 		goto abort_with_mgp;
5786 	}
5787 
5788 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5789 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5790 	ss_offset = hdr_offset +
5791 	    offsetof(struct mcp_gen_header, string_specs);
5792 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5793 	myri10ge_pio_copy32(mgp->eeprom_strings,
5794 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5795 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5796 	(void) memset(mgp->eeprom_strings +
5797 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5798 
5799 	status = myri10ge_read_mac_addr(mgp);
5800 	if (status) {
5801 		goto abort_with_mapped;
5802 	}
5803 
5804 	status = myri10ge_select_firmware(mgp);
5805 	if (status != 0) {
5806 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5807 		goto abort_with_mapped;
5808 	}
5809 
5810 	status = myri10ge_probe_slices(mgp);
5811 	if (status != 0) {
5812 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5813 		goto abort_with_dummy_rdma;
5814 	}
5815 
5816 	status = myri10ge_alloc_slices(mgp);
5817 	if (status != 0) {
5818 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5819 		goto abort_with_dummy_rdma;
5820 	}
5821 
5822 	/* add the interrupt handler */
5823 	status = myri10ge_add_intrs(mgp, 1);
5824 	if (status != 0) {
5825 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5826 		    mgp->name);
5827 		goto abort_with_slices;
5828 	}
5829 
5830 	/* now that we have an iblock_cookie, init the mutexes */
5831 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5832 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5833 
5834 
5835 	status = myri10ge_nic_stat_init(mgp);
5836 	if (status != DDI_SUCCESS)
5837 		goto abort_with_interrupts;
5838 	status = myri10ge_info_init(mgp);
5839 	if (status != DDI_SUCCESS)
5840 		goto abort_with_stats;
5841 
5842 	/*
5843 	 *	Initialize  GLD state
5844 	 */
5845 
5846 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5847 	macp->m_driver = mgp;
5848 	macp->m_dip = dip;
5849 	macp->m_src_addr = mgp->mac_addr;
5850 	macp->m_callbacks = &myri10ge_m_callbacks;
5851 	macp->m_min_sdu = 0;
5852 	macp->m_max_sdu = myri10ge_mtu -
5853 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5854 #ifdef SOLARIS_S11
5855 	macp->m_margin = VLAN_TAGSZ;
5856 #endif
5857 	macp->m_v12n = MAC_VIRT_LEVEL1;
5858 	status = mac_register(macp, &mgp->mh);
5859 	if (status != 0) {
5860 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5861 		    mgp->name, status);
5862 		goto abort_with_info;
5863 	}
5864 	myri10ge_ndd_init(mgp);
5865 	if (myri10ge_verbose)
5866 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5867 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5868 	mutex_enter(&myri10ge_param_lock);
5869 	mgp->next = mgp_list;
5870 	mgp_list = mgp;
5871 	mutex_exit(&myri10ge_param_lock);
5872 	kmem_free(macp, sizeof (*macp) * 8);
5873 	mac_free(omacp);
5874 	return (DDI_SUCCESS);
5875 
5876 abort_with_info:
5877 	myri10ge_info_destroy(mgp);
5878 
5879 abort_with_stats:
5880 	myri10ge_nic_stat_destroy(mgp);
5881 
5882 abort_with_interrupts:
5883 	mutex_destroy(&mgp->cmd_lock);
5884 	mutex_destroy(&mgp->intrlock);
5885 	myri10ge_rem_intrs(mgp, 1);
5886 
5887 abort_with_slices:
5888 	myri10ge_free_slices(mgp);
5889 
5890 abort_with_dummy_rdma:
5891 	myri10ge_dummy_rdma(mgp, 0);
5892 
5893 abort_with_mapped:
5894 	ddi_regs_map_free(&mgp->io_handle);
5895 
5896 	myri10ge_dma_free(&mgp->cmd_dma);
5897 
5898 abort_with_mgp:
5899 	kmem_free(mgp, sizeof (*mgp));
5900 
5901 abort_with_macinfo:
5902 	kmem_free(macp, sizeof (*macp) * 8);
5903 	mac_free(omacp);
5904 
5905 abort_with_cfg_hdl:
5906 	pci_config_teardown(&handle);
5907 	return (DDI_FAILURE);
5908 
5909 }
5910 
5911 
/*
 * detach(9E) entry point.
 *
 * DDI_SUSPEND is forwarded to myri10ge_suspend().  For DDI_DETACH the
 * detach is refused (DDI_FAILURE) while receive buffers are still
 * loaned out or while references taken through
 * myri10ge_get_instance() remain; a failing mac_unregister() status is
 * returned as is.  Otherwise everything myri10ge_attach() set up is
 * released, the instance is unlinked from mgp_list and freed, and
 * DDI_SUCCESS is returned.
 */
static int
myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct myri10ge_priv	*mgp, *tmp;
	int 			status, i, jbufs_alloced;

	if (cmd == DDI_SUSPEND) {
		status = myri10ge_suspend(dip);
		return (status);
	}

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}
	/* Get the driver private (struct myri10ge_priv) structure */
	mgp = ddi_get_driver_private(dip);

	/* reclaim the jumbo buffers and count the ones still allocated */
	mutex_enter(&mgp->intrlock);
	jbufs_alloced = 0;
	for (i = 0; i < mgp->num_slices; i++) {
		myri10ge_remove_jbufs(&mgp->ss[i]);
		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
	}
	mutex_exit(&mgp->intrlock);
	if (jbufs_alloced != 0) {
		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
		    mgp->name, jbufs_alloced);
		return (DDI_FAILURE);
	}

	/*
	 * NOTE(review): the lock is dropped between this refcnt check and
	 * the unlink from mgp_list at the bottom, so a concurrent
	 * myri10ge_get_instance() could presumably still take a
	 * reference in between -- confirm whether callers can race with
	 * detach.
	 */
	mutex_enter(&myri10ge_param_lock);
	if (mgp->refcnt != 0) {
		mutex_exit(&myri10ge_param_lock);
		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
		    mgp->name, mgp->refcnt);
		return (DDI_FAILURE);
	}
	mutex_exit(&myri10ge_param_lock);

	/* nothing has been torn down yet if the MAC layer refuses */
	status = mac_unregister(mgp->mh);
	if (status != DDI_SUCCESS)
		return (status);

	myri10ge_ndd_fini(mgp);
	myri10ge_dummy_rdma(mgp, 0);
	myri10ge_nic_stat_destroy(mgp);
	myri10ge_info_destroy(mgp);

	mutex_destroy(&mgp->cmd_lock);
	mutex_destroy(&mgp->intrlock);

	myri10ge_rem_intrs(mgp, 1);

	myri10ge_free_slices(mgp);
	ddi_regs_map_free(&mgp->io_handle);
	myri10ge_dma_free(&mgp->cmd_dma);
	pci_config_teardown(&mgp->cfg_hdl);

	/* unlink from the global instance list, then free the instance */
	mutex_enter(&myri10ge_param_lock);
	if (mgp_list == mgp) {
		mgp_list = mgp->next;
	} else {
		tmp = mgp_list;
		while (tmp->next != mgp && tmp->next != NULL)
			tmp = tmp->next;
		if (tmp->next != NULL)
			tmp->next = tmp->next->next;
	}
	kmem_free(mgp, sizeof (*mgp));
	mutex_exit(&myri10ge_param_lock);
	return (DDI_SUCCESS);
}
5984 
/*
 * Helper for quiesce entry point: Interrupt threads are not being
 * scheduled, so we must poll for the confirmation DMA to arrive in
 * the firmware stats block for slice 0.  We're essentially running
 * the guts of the interrupt handler, and just cherry picking the
 * confirmation that the NIC is quiesced (stats->link_down).
 *
 * Returns 1 when the stats block reports an updated link-down state,
 * 0 when no interrupt is pending or the pending one did not carry
 * that confirmation.
 */

static int
myri10ge_poll_down(struct myri10ge_priv *mgp)
{
	struct myri10ge_slice_state *ss = mgp->ss;
	mcp_irq_data_t *stats = ss->fw_stats;
	int valid;
	int found_down = 0;


	/* check for a pending IRQ */

	if (! *((volatile uint8_t *)& stats->valid))
		return (0);
	valid = stats->valid;

	/*
	 * Make sure to tell the NIC to lower a legacy IRQ, else
	 * it may have corrupt state after restarting
	 */

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		mb();
		/* wait for irq conf DMA */
		/* NOTE(review): this spin has no timeout */
		while (*((volatile uint8_t *)& stats->valid))
			;
	}
	if (stats->stats_updated && stats->link_down)
		found_down = 1;

	/*
	 * Claim the interrupt: the first claim word is written only when
	 * bit 0 of 'valid' was set, the second one unconditionally.
	 */
	if (valid & 0x1)
		*ss->irq_claim = BE_32(3);
	*(ss->irq_claim + 1) = BE_32(3);

	return (found_down);
}
6030 
6031 static int
6032 myri10ge_quiesce(dev_info_t *dip)
6033 {
6034 	struct myri10ge_priv *mgp;
6035 	myri10ge_cmd_t cmd;
6036 	int status, down, i;
6037 
6038 	mgp = ddi_get_driver_private(dip);
6039 	if (mgp == NULL)
6040 		return (DDI_FAILURE);
6041 
6042 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6043 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6044 		return (DDI_SUCCESS);
6045 
6046 	/* send a down CMD to queuesce NIC */
6047 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6048 	if (status) {
6049 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6050 		return (DDI_FAILURE);
6051 	}
6052 
6053 	for (i = 0; i < 20; i++) {
6054 		down = myri10ge_poll_down(mgp);
6055 		if (down)
6056 			break;
6057 		delay(drv_usectohz(100000));
6058 		mb();
6059 	}
6060 	if (down)
6061 		return (DDI_SUCCESS);
6062 	return (DDI_FAILURE);
6063 }
6064 
6065 /*
6066  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6067  * storage.
6068  */
6069 static void
6070 myri10ge_find_lastfree(void)
6071 {
6072 	mblk_t *mp = allocb(1024, 0);
6073 	dblk_t *dbp;
6074 
6075 	if (mp == NULL) {
6076 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6077 		return;
6078 	}
6079 	dbp = mp->b_datap;
6080 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6081 }
6082 
6083 int
6084 _init(void)
6085 {
6086 	int i;
6087 
6088 	if (myri10ge_verbose)
6089 		cmn_err(CE_NOTE,
6090 		    "Myricom 10G driver (10GbE) version %s loading\n",
6091 		    MYRI10GE_VERSION_STR);
6092 	myri10ge_find_lastfree();
6093 	mac_init_ops(&myri10ge_ops, "myri10ge");
6094 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6095 	if ((i = mod_install(&modlinkage)) != 0) {
6096 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6097 		mac_fini_ops(&myri10ge_ops);
6098 		mutex_destroy(&myri10ge_param_lock);
6099 	}
6100 	return (i);
6101 }
6102 
6103 int
6104 _fini(void)
6105 {
6106 	int i;
6107 	i = mod_remove(&modlinkage);
6108 	if (i != 0) {
6109 		return (i);
6110 	}
6111 	mac_fini_ops(&myri10ge_ops);
6112 	mutex_destroy(&myri10ge_param_lock);
6113 	return (0);
6114 }
6115 
6116 int
6117 _info(struct modinfo *modinfop)
6118 {
6119 	return (mod_info(&modlinkage, modinfop));
6120 }
6121 
6122 
6123 /*
6124  *  This file uses MyriGE driver indentation.
6125  *
6126  * Local Variables:
6127  * c-file-style:"sun"
6128  * tab-width:8
6129  * End:
6130  */
6131