xref: /illumos-gate/usr/src/uts/common/io/igb/igb_rx.c (revision 214c2196)
1 /*
2  * CDDL HEADER START
3  *
4  * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #include "igb_sw.h"
29 
/*
 * Prototypes for the helpers that are private to this file; each of
 * them is defined further down.
 */
static mblk_t *igb_rx_bind(igb_rx_data_t *, uint32_t, uint32_t);
static mblk_t *igb_rx_copy(igb_rx_data_t *, uint32_t, uint32_t);
static void igb_rx_assoc_hcksum(mblk_t *, uint32_t);

/* The inline request is only issued when IGB_DEBUG is not defined */
#ifndef IGB_DEBUG
#pragma inline(igb_rx_assoc_hcksum)
#endif
38 
39 
40 /*
41  * igb_rx_recycle - the call-back function to reclaim rx buffer
42  *
43  * This function is called when an mp is freed by the user thru
44  * freeb call (Only for mp constructed through desballoc call).
45  * It returns back the freed buffer to the free list.
46  */
47 void
48 igb_rx_recycle(caddr_t arg)
49 {
50 	igb_t *igb;
51 	igb_rx_ring_t *rx_ring;
52 	igb_rx_data_t	*rx_data;
53 	rx_control_block_t *recycle_rcb;
54 	uint32_t free_index;
55 	uint32_t ref_cnt;
56 
57 	recycle_rcb = (rx_control_block_t *)(uintptr_t)arg;
58 	rx_data = recycle_rcb->rx_data;
59 	rx_ring = rx_data->rx_ring;
60 	igb = rx_ring->igb;
61 
62 	if (recycle_rcb->ref_cnt == 0) {
63 		/*
64 		 * This case only happens when rx buffers are being freed
65 		 * in igb_stop() and freemsg() is called.
66 		 */
67 		return;
68 	}
69 
70 	ASSERT(recycle_rcb->mp == NULL);
71 
72 	/*
73 	 * Using the recycled data buffer to generate a new mblk
74 	 */
75 	recycle_rcb->mp = desballoc((unsigned char *)
76 	    recycle_rcb->rx_buf.address,
77 	    recycle_rcb->rx_buf.size,
78 	    0, &recycle_rcb->free_rtn);
79 
80 	/*
81 	 * Put the recycled rx control block into free list
82 	 */
83 	mutex_enter(&rx_data->recycle_lock);
84 
85 	free_index = rx_data->rcb_tail;
86 	ASSERT(rx_data->free_list[free_index] == NULL);
87 
88 	rx_data->free_list[free_index] = recycle_rcb;
89 	rx_data->rcb_tail = NEXT_INDEX(free_index, 1, rx_data->free_list_size);
90 
91 	mutex_exit(&rx_data->recycle_lock);
92 
93 	/*
94 	 * The atomic operation on the number of the available rx control
95 	 * blocks in the free list is used to make the recycling mutual
96 	 * exclusive with the receiving.
97 	 */
98 	atomic_inc_32(&rx_data->rcb_free);
99 	ASSERT(rx_data->rcb_free <= rx_data->free_list_size);
100 
101 	/*
102 	 * Considering the case that the interface is unplumbed
103 	 * and there are still some buffers held by the upper layer.
104 	 * When the buffer is returned back, we need to free it.
105 	 */
106 	ref_cnt = atomic_dec_32_nv(&recycle_rcb->ref_cnt);
107 	if (ref_cnt == 0) {
108 		if (recycle_rcb->mp != NULL) {
109 			freemsg(recycle_rcb->mp);
110 			recycle_rcb->mp = NULL;
111 		}
112 
113 		igb_free_dma_buffer(&recycle_rcb->rx_buf);
114 
115 		mutex_enter(&igb->rx_pending_lock);
116 		atomic_dec_32(&rx_data->rcb_pending);
117 		atomic_dec_32(&igb->rcb_pending);
118 
119 		/*
120 		 * When there is not any buffer belonging to this rx_data
121 		 * held by the upper layer, the rx_data can be freed.
122 		 */
123 		if ((rx_data->flag & IGB_RX_STOPPED) &&
124 		    (rx_data->rcb_pending == 0))
125 			igb_free_rx_ring_data(rx_data);
126 
127 		mutex_exit(&igb->rx_pending_lock);
128 	}
129 }
130 
131 /*
132  * igb_rx_copy - Use copy to process the received packet
133  *
134  * This function will use bcopy to process the packet
135  * and send the copied packet upstream
136  */
137 static mblk_t *
138 igb_rx_copy(igb_rx_data_t *rx_data, uint32_t index, uint32_t pkt_len)
139 {
140 	rx_control_block_t *current_rcb;
141 	mblk_t *mp;
142 	igb_t *igb = rx_data->rx_ring->igb;
143 
144 	current_rcb = rx_data->work_list[index];
145 
146 	DMA_SYNC(&current_rcb->rx_buf, DDI_DMA_SYNC_FORKERNEL);
147 
148 	if (igb_check_dma_handle(
149 	    current_rcb->rx_buf.dma_handle) != DDI_FM_OK) {
150 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
151 		atomic_or_32(&igb->igb_state, IGB_ERROR);
152 		return (NULL);
153 	}
154 
155 	/*
156 	 * Allocate buffer to receive this packet
157 	 */
158 	mp = allocb(pkt_len + IPHDR_ALIGN_ROOM, 0);
159 	if (mp == NULL) {
160 		igb_log(igb, "igb_rx_copy: allocate buffer failed");
161 		return (NULL);
162 	}
163 
164 	/*
165 	 * Copy the data received into the new cluster
166 	 */
167 	mp->b_rptr += IPHDR_ALIGN_ROOM;
168 	bcopy(current_rcb->rx_buf.address, mp->b_rptr, pkt_len);
169 	mp->b_wptr = mp->b_rptr + pkt_len;
170 
171 	return (mp);
172 }
173 
174 /*
175  * igb_rx_bind - Use existing DMA buffer to build mblk for receiving
176  *
177  * This function will use pre-bound DMA buffer to receive the packet
178  * and build mblk that will be sent upstream.
179  */
180 static mblk_t *
181 igb_rx_bind(igb_rx_data_t *rx_data, uint32_t index, uint32_t pkt_len)
182 {
183 	rx_control_block_t *current_rcb;
184 	rx_control_block_t *free_rcb;
185 	uint32_t free_index;
186 	mblk_t *mp;
187 	igb_t *igb = rx_data->rx_ring->igb;
188 
189 	/*
190 	 * If the free list is empty, we cannot proceed to send
191 	 * the current DMA buffer upstream. We'll have to return
192 	 * and use bcopy to process the packet.
193 	 */
194 	if (igb_atomic_reserve(&rx_data->rcb_free, 1) < 0)
195 		return (NULL);
196 
197 	current_rcb = rx_data->work_list[index];
198 	/*
199 	 * If the mp of the rx control block is NULL, try to do
200 	 * desballoc again.
201 	 */
202 	if (current_rcb->mp == NULL) {
203 		current_rcb->mp = desballoc((unsigned char *)
204 		    current_rcb->rx_buf.address,
205 		    current_rcb->rx_buf.size,
206 		    0, &current_rcb->free_rtn);
207 		/*
208 		 * If it is failed to built a mblk using the current
209 		 * DMA buffer, we have to return and use bcopy to
210 		 * process the packet.
211 		 */
212 		if (current_rcb->mp == NULL) {
213 			atomic_inc_32(&rx_data->rcb_free);
214 			return (NULL);
215 		}
216 	}
217 	/*
218 	 * Sync up the data received
219 	 */
220 	DMA_SYNC(&current_rcb->rx_buf, DDI_DMA_SYNC_FORKERNEL);
221 
222 	if (igb_check_dma_handle(
223 	    current_rcb->rx_buf.dma_handle) != DDI_FM_OK) {
224 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
225 		atomic_or_32(&igb->igb_state, IGB_ERROR);
226 		atomic_inc_32(&rx_data->rcb_free);
227 		return (NULL);
228 	}
229 
230 	mp = current_rcb->mp;
231 	current_rcb->mp = NULL;
232 	atomic_inc_32(&current_rcb->ref_cnt);
233 
234 	mp->b_wptr = mp->b_rptr + pkt_len;
235 	mp->b_next = mp->b_cont = NULL;
236 
237 	/*
238 	 * Strip off one free rx control block from the free list
239 	 */
240 	free_index = rx_data->rcb_head;
241 	free_rcb = rx_data->free_list[free_index];
242 	ASSERT(free_rcb != NULL);
243 	rx_data->free_list[free_index] = NULL;
244 	rx_data->rcb_head = NEXT_INDEX(free_index, 1, rx_data->free_list_size);
245 
246 	/*
247 	 * Put the rx control block to the work list
248 	 */
249 	rx_data->work_list[index] = free_rcb;
250 
251 	return (mp);
252 }
253 
254 /*
255  * igb_rx_assoc_hcksum
256  *
257  * Check the rx hardware checksum status and associate the hcksum flags
258  */
259 static void
260 igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
261 {
262 	uint32_t hcksum_flags = 0;
263 
264 	/* Ignore Checksum Indication */
265 	if (status_error & E1000_RXD_STAT_IXSM)
266 		return;
267 
268 	/*
269 	 * Check TCP/UDP checksum
270 	 */
271 	if (((status_error & E1000_RXD_STAT_TCPCS) ||
272 	    (status_error & E1000_RXD_STAT_UDPCS)) &&
273 	    !(status_error & E1000_RXDEXT_STATERR_TCPE))
274 		hcksum_flags |= HCK_FULLCKSUM_OK;
275 
276 	/*
277 	 * Check IP Checksum
278 	 */
279 	if ((status_error & E1000_RXD_STAT_IPCS) &&
280 	    !(status_error & E1000_RXDEXT_STATERR_IPE))
281 		hcksum_flags |= HCK_IPV4_HDRCKSUM_OK;
282 
283 	if (hcksum_flags != 0) {
284 		mac_hcksum_set(mp, 0, 0, 0, 0, hcksum_flags);
285 	}
286 }
287 
288 mblk_t *
289 igb_rx_ring_poll(void *arg, int bytes)
290 {
291 	igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)arg;
292 	mblk_t *mp = NULL;
293 
294 	ASSERT(bytes >= 0);
295 
296 	if ((bytes == 0) || (rx_ring->igb->igb_state & IGB_SUSPENDED) ||
297 	    !(rx_ring->igb->igb_state & IGB_STARTED))
298 		return (NULL);
299 
300 	mutex_enter(&rx_ring->rx_lock);
301 	mp = igb_rx(rx_ring, bytes);
302 	mutex_exit(&rx_ring->rx_lock);
303 
304 	return (mp);
305 }
306 
307 /*
308  * igb_rx - Receive the data of one ring
309  *
310  * This function goes throught h/w descriptor in one specified rx ring,
311  * receives the data if the descriptor status shows the data is ready.
312  * It returns a chain of mblks containing the received data, to be
313  * passed up to mac_rx().
314  */
315 mblk_t *
316 igb_rx(igb_rx_ring_t *rx_ring, int poll_bytes)
317 {
318 	union e1000_adv_rx_desc *current_rbd;
319 	rx_control_block_t *current_rcb;
320 	mblk_t *mp;
321 	mblk_t *mblk_head;
322 	mblk_t **mblk_tail;
323 	uint32_t rx_next;
324 	uint32_t rx_tail;
325 	uint32_t pkt_len;
326 	uint32_t status_error;
327 	uint32_t pkt_num;
328 	uint32_t total_bytes;
329 	igb_t *igb = rx_ring->igb;
330 	igb_rx_data_t *rx_data = rx_ring->rx_data;
331 
332 	mblk_head = NULL;
333 	mblk_tail = &mblk_head;
334 
335 	if (igb->igb_state & IGB_ERROR)
336 		return (NULL);
337 
338 	/*
339 	 * Sync the receive descriptors before
340 	 * accepting the packets
341 	 */
342 	DMA_SYNC(&rx_data->rbd_area, DDI_DMA_SYNC_FORKERNEL);
343 
344 	if (igb_check_dma_handle(
345 	    rx_data->rbd_area.dma_handle) != DDI_FM_OK) {
346 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
347 		atomic_or_32(&igb->igb_state, IGB_ERROR);
348 		return (NULL);
349 	}
350 
351 	/*
352 	 * Get the start point of rx bd ring which should be examined
353 	 * during this cycle.
354 	 */
355 	rx_next = rx_data->rbd_next;
356 
357 	current_rbd = &rx_data->rbd_ring[rx_next];
358 	pkt_num = 0;
359 	total_bytes = 0;
360 	status_error = current_rbd->wb.upper.status_error;
361 	while (status_error & E1000_RXD_STAT_DD) {
362 		/*
363 		 * If hardware has found the errors, but the error
364 		 * is hardware checksum error, here does not discard the
365 		 * packet, and let upper layer compute the checksum;
366 		 * Otherwise discard the packet.
367 		 */
368 		if ((status_error & E1000_RXDEXT_ERR_FRAME_ERR_MASK) ||
369 		    !(status_error & E1000_RXD_STAT_EOP)) {
370 			IGB_DEBUG_STAT(rx_ring->stat_frame_error);
371 			goto rx_discard;
372 		}
373 
374 		IGB_DEBUG_STAT_COND(rx_ring->stat_cksum_error,
375 		    (status_error & E1000_RXDEXT_STATERR_TCPE) ||
376 		    (status_error & E1000_RXDEXT_STATERR_IPE));
377 
378 		pkt_len = current_rbd->wb.upper.length;
379 
380 		if ((poll_bytes != IGB_NO_POLL) &&
381 		    ((pkt_len + total_bytes) > poll_bytes))
382 			break;
383 
384 		IGB_DEBUG_STAT(rx_ring->stat_pkt_cnt);
385 		total_bytes += pkt_len;
386 
387 		mp = NULL;
388 		/*
389 		 * For packets with length more than the copy threshold,
390 		 * we'll firstly try to use the existed DMA buffer to built
391 		 * a mblk and send the mblk upstream.
392 		 *
393 		 * If the first method fails, or the packet length is less
394 		 * than the copy threshold, we'll allocate a new mblk and
395 		 * copy the packet data to the mblk.
396 		 */
397 		if (pkt_len > igb->rx_copy_thresh)
398 			mp = igb_rx_bind(rx_data, rx_next, pkt_len);
399 
400 		if (mp == NULL)
401 			mp = igb_rx_copy(rx_data, rx_next, pkt_len);
402 
403 		if (mp != NULL) {
404 			/*
405 			 * Check h/w checksum offload status
406 			 */
407 			if (igb->rx_hcksum_enable)
408 				igb_rx_assoc_hcksum(mp, status_error);
409 
410 			*mblk_tail = mp;
411 			mblk_tail = &mp->b_next;
412 		}
413 
414 		/* Update per-ring rx statistics */
415 		rx_ring->rx_pkts++;
416 		rx_ring->rx_bytes += pkt_len;
417 
418 rx_discard:
419 		/*
420 		 * Reset rx descriptor read bits
421 		 */
422 		current_rcb = rx_data->work_list[rx_next];
423 		current_rbd->read.pkt_addr = current_rcb->rx_buf.dma_address;
424 		current_rbd->read.hdr_addr = 0;
425 
426 		rx_next = NEXT_INDEX(rx_next, 1, rx_data->ring_size);
427 
428 		/*
429 		 * The receive function is in interrupt context, so here
430 		 * rx_limit_per_intr is used to avoid doing receiving too long
431 		 * per interrupt.
432 		 */
433 		if (++pkt_num > igb->rx_limit_per_intr) {
434 			IGB_DEBUG_STAT(rx_ring->stat_exceed_pkt);
435 			break;
436 		}
437 
438 		current_rbd = &rx_data->rbd_ring[rx_next];
439 		status_error = current_rbd->wb.upper.status_error;
440 	}
441 
442 	DMA_SYNC(&rx_data->rbd_area, DDI_DMA_SYNC_FORDEV);
443 
444 	rx_data->rbd_next = rx_next;
445 
446 	/*
447 	 * Update the h/w tail accordingly
448 	 */
449 	rx_tail = PREV_INDEX(rx_next, 1, rx_data->ring_size);
450 
451 	E1000_WRITE_REG(&igb->hw, E1000_RDT(rx_ring->index), rx_tail);
452 
453 	if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
454 		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
455 		atomic_or_32(&igb->igb_state, IGB_ERROR);
456 	}
457 
458 	return (mblk_head);
459 }
460