1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Parallel for algorithm.
6 
7 #include "runtime.h"
8 #include "arch.h"
9 
10 struct ParForThread
11 {
12 	// the thread's iteration space [32lsb, 32msb)
13 	uint64 pos;
14 	// stats
15 	uint64 nsteal;
16 	uint64 nstealcnt;
17 	uint64 nprocyield;
18 	uint64 nosyield;
19 	uint64 nsleep;
20 	byte pad[CacheLineSize];
21 };
22 
23 ParFor*
runtime_parforalloc(uint32 nthrmax)24 runtime_parforalloc(uint32 nthrmax)
25 {
26 	ParFor *desc;
27 
28 	// The ParFor object is followed by CacheLineSize padding
29 	// and then nthrmax ParForThread.
30 	desc = (ParFor*)runtime_malloc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread));
31 	desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize);
32 	desc->nthrmax = nthrmax;
33 	return desc;
34 }
35 
36 void
runtime_parforsetup(ParFor * desc,uint32 nthr,uint32 n,bool wait,const FuncVal * body)37 runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, bool wait, const FuncVal *body)
38 {
39 	uint32 i, begin, end;
40 	uint64 *pos;
41 
42 	if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) {
43 		runtime_printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body);
44 		runtime_throw("parfor: invalid args");
45 	}
46 
47 	desc->body = body;
48 	desc->done = 0;
49 	desc->nthr = nthr;
50 	desc->thrseq = 0;
51 	desc->cnt = n;
52 	desc->wait = wait;
53 	desc->nsteal = 0;
54 	desc->nstealcnt = 0;
55 	desc->nprocyield = 0;
56 	desc->nosyield = 0;
57 	desc->nsleep = 0;
58 	for(i=0; i<nthr; i++) {
59 		begin = (uint64)n*i / nthr;
60 		end = (uint64)n*(i+1) / nthr;
61 		pos = &desc->thr[i].pos;
62 		if(((uintptr)pos & 7) != 0)
63 			runtime_throw("parforsetup: pos is not aligned");
64 		*pos = (uint64)begin | (((uint64)end)<<32);
65 	}
66 }
67 
68 void
runtime_parfordo(ParFor * desc)69 runtime_parfordo(ParFor *desc)
70 {
71 	ParForThread *me;
72 	uint32 tid, begin, end, begin2, try, victim, i;
73 	uint64 *mypos, *victimpos, pos, newpos;
74 	const FuncVal *body;
75 	void (*bodyfn)(ParFor*, uint32);
76 	bool idle;
77 
78 	// Obtain 0-based thread index.
79 	tid = runtime_xadd(&desc->thrseq, 1) - 1;
80 	if(tid >= desc->nthr) {
81 		runtime_printf("tid=%d nthr=%d\n", tid, desc->nthr);
82 		runtime_throw("parfor: invalid tid");
83 	}
84 
85 	body = desc->body;
86 	bodyfn = (void (*)(ParFor*, uint32))(void*)body->fn;
87 
88 	// If single-threaded, just execute the for serially.
89 	if(desc->nthr==1) {
90 		for(i=0; i<desc->cnt; i++)
91 		  __builtin_call_with_static_chain (bodyfn(desc, i), body);
92 		return;
93 	}
94 
95 	me = &desc->thr[tid];
96 	mypos = &me->pos;
97 	for(;;) {
98 		for(;;) {
99 			// While there is local work,
100 			// bump low index and execute the iteration.
101 			pos = runtime_xadd64(mypos, 1);
102 			begin = (uint32)pos-1;
103 			end = (uint32)(pos>>32);
104 			if(begin < end) {
105 				__builtin_call_with_static_chain(bodyfn(desc, begin), body);
106 				continue;
107 			}
108 			break;
109 		}
110 
111 		// Out of work, need to steal something.
112 		idle = false;
113 		for(try=0;; try++) {
114 			// If we don't see any work for long enough,
115 			// increment the done counter...
116 			if(try > desc->nthr*4 && !idle) {
117 				idle = true;
118 				runtime_xadd(&desc->done, 1);
119 			}
120 			// ...if all threads have incremented the counter,
121 			// we are done.
122 			if(desc->done + !idle == desc->nthr) {
123 				if(!idle)
124 					runtime_xadd(&desc->done, 1);
125 				goto exit;
126 			}
127 			// Choose a random victim for stealing.
128 			victim = runtime_fastrand1() % (desc->nthr-1);
129 			if(victim >= tid)
130 				victim++;
131 			victimpos = &desc->thr[victim].pos;
132 			for(;;) {
133 				// See if it has any work.
134 				pos = runtime_atomicload64(victimpos);
135 				begin = (uint32)pos;
136 				end = (uint32)(pos>>32);
137 				if(begin+1 >= end) {
138 					begin = end = 0;
139 					break;
140 				}
141 				if(idle) {
142 					runtime_xadd(&desc->done, -1);
143 					idle = false;
144 				}
145 				begin2 = begin + (end-begin)/2;
146 				newpos = (uint64)begin | (uint64)begin2<<32;
147 				if(runtime_cas64(victimpos, pos, newpos)) {
148 					begin = begin2;
149 					break;
150 				}
151 			}
152 			if(begin < end) {
153 				// Has successfully stolen some work.
154 				if(idle)
155 					runtime_throw("parfor: should not be idle");
156 				runtime_atomicstore64(mypos, (uint64)begin | (uint64)end<<32);
157 				me->nsteal++;
158 				me->nstealcnt += end-begin;
159 				break;
160 			}
161 			// Backoff.
162 			if(try < desc->nthr) {
163 				// nothing
164 			} else if (try < 4*desc->nthr) {
165 				me->nprocyield++;
166 				runtime_procyield(20);
167 			// If a caller asked not to wait for the others, exit now
168 			// (assume that most work is already done at this point).
169 			} else if (!desc->wait) {
170 				if(!idle)
171 					runtime_xadd(&desc->done, 1);
172 				goto exit;
173 			} else if (try < 6*desc->nthr) {
174 				me->nosyield++;
175 				runtime_osyield();
176 			} else {
177 				me->nsleep++;
178 				runtime_usleep(1);
179 			}
180 		}
181 	}
182 exit:
183 	runtime_xadd64(&desc->nsteal, me->nsteal);
184 	runtime_xadd64(&desc->nstealcnt, me->nstealcnt);
185 	runtime_xadd64(&desc->nprocyield, me->nprocyield);
186 	runtime_xadd64(&desc->nosyield, me->nosyield);
187 	runtime_xadd64(&desc->nsleep, me->nsleep);
188 	me->nsteal = 0;
189 	me->nstealcnt = 0;
190 	me->nprocyield = 0;
191 	me->nosyield = 0;
192 	me->nsleep = 0;
193 }
194 
195 // For testing from Go.
196 void
runtime_parforiters(ParFor * desc,uintptr tid,uintptr * start,uintptr * end)197 runtime_parforiters(ParFor *desc, uintptr tid, uintptr *start, uintptr *end)
198 {
199 	*start = (uint32)desc->thr[tid].pos;
200 	*end = (uint32)(desc->thr[tid].pos>>32);
201 }
202