1 #include "chess.h"
2 #include "data.h"
3 #include "epdglue.h"
4 /* modified 08/03/16 */
5 /*
6 *******************************************************************************
7 * *
8 * Split() is the driver for the threaded parallel search in Crafty. The *
9 * basic idea is that whenever we notice that one (or more) threads are in *
10 * their idle loop, we drop into Split(), from Search(), and begin a new *
11 * parallel search from this node. This is simply a problem of establishing *
12 * a new split point, and then letting each thread join this split point and *
13 * copy whatever data they need. *
14 * *
15 * This is generation II of Split(). The primary differences address two *
16 * specific performance-robbing issues. (1) Excessive waiting for a split *
17 *  to be done, and (2) excessive waiting on specific locks.  Generation II  *
18 * addresses both of these to significantly improve performance. *
19 * *
20 * The main difference between Gen I and Gen II is the effort required to *
21 * split the tree and which thread(s) expend this effort. In generation I, *
22 * the parent thread was responsible for allocating a split block for each *
23 * child thread, and then copying the necessary data from the parent split *
24 * block to these child split blocks. When all of this was completed, the *
25 * child processes were released to start the parallel search after being *
26 * held while the split / copy operations were done. In the generation II *
27 * Split() we now simply allocate a new split block for THIS thread, flag *
28 * the parent split block as joinable, and then go directly to ThreadWait() *
29 * which will drop us back in to the search immediately. The idle threads *
30 * are continually looping on Join() which will jump them right into this *
31 * split block letting them do ALL the work of allocating a split block, *
32 * filling it in, and then copying the data to their local split block. *
33 * This distributes the split overhead among all the threads that split, *
34 * rather than this thread having to do all the work while the other threads *
35 * sit idle. *
36 * *
37 * Generation II is also much more lightweight, in that it copies only the *
38 * bare minimum from parent to child. Generation I safely copied far too *
39 * much since this code was being changed regularly, but that is no longer *
40 * necessary overhead. *
41 * *
42 * Generation II has a zero contention split algorithm. In the past, when a *
43 * thread became idle, it posted a global split request and any thread that *
44 * was at an appropriate point would try to split. But while it was doing *
45 * the splitting, other threads that were also willing to split would "get *
46 * in line" because Crafty used a global lock to prevent two threads from *
47 * attempting to split at the same instant in time. They got in line, and *
48 * waited for the original splitter to release the lock, but now they have *
49 * no idle threads to split with. A waste of time. Now we allow ANY thread *
50 * to attempt to split at the current ply. When we do what might be called *
51 * a "gratuitous split" the only restriction is that if we have existing *
52 * "gratuitous split points" (split blocks that are joinable but have not *
53 * yet been joined), then we limit the number of such splits (per thread) to *
54 * avoid excessive overhead. *
55 * *
56 * Generation II takes another idea from DTS, the idea of "late-join". The *
57 * idea is fairly simple. If, when a thread becomes idle, there are already *
58 * other split points being searched in parallel, then we will try to join *
59 * one of them rather than waiting for someone to ask us to help. We use *
60 * some simple criteria: (1) The split point must be joinable, which simply *
61 * means that no processor has exited the split point yet (which would let *
62 * us know there is no more work here and a join would be futile); (2) We *
63 * compute an "interest" value which is a simple formula based on depth at *
64 * the split point, and the number of moves already searched. It seems less *
65 * risky to choose a split point with max depth AND minimum moves already *
66 * searched so that there is plenty to do. This was quite simple to add *
67 * after the rest of the Generation II rewrite. In fact, this is now THE *
68 * way threads join a split point, period, which further simplifies the code *
69 * and improves efficiency. IE, a thread can split when idle threads are *
70 * noticed, or if it is far enough away from the tips to make the cost *
71 * negligible. At that point any idle thread(s) can join immediately, those *
72 * that become idle later can join when they are ready. *
73 * *
74 * There are a number of settable options via the command-line or .craftyrc *
75 * initialization file. Here's a concise explanation for each option and an *
76 * occasional suggestion for testing/tuning. *
77 * *
78 *  smp_affinity (command = smpaffinity=<n>) is used to enable or disable     *
79 * processor affinity. -1 disables affinity and lets threads run on any *
80 * available core. If you use an integer <n> then thread zero will bind *
81 * itself to cpu <n> and each additional thread will bind to the next *
82 * higher cpu number. This is useful if you try to run two copies of *
83 * crafty on the same machine, now you can cause one to bind to the first *
84 * <n> cores, and the second to the last <n> cores. For the first *
85 * instance of Crafty, you would use smpaffinity=0, and for the second *
86 * smpaffinity=8, assuming you are running 8 threads per copy on a 16 cpu *
87 * machine. If you get this wrong, you can have more than one thread on *
88 * the same cpu which will significantly impact performance. *
89 * *
90 * smp_max_threads (command = smpmt=n) sets the total number of allowable *
91 * threads for the search. The default is one (1) as Crafty does not *
92 * assume it should use all available resources. For optimal performance *
93 * this should be set to the number of physical cores your machine has, *
94 * which does NOT include hyperthreaded cores. *
95 * *
96 * smp_split_group (command = smpgroup=n) sets the maximum number of threads *
97 * at any single split point, with the exception of split points fairly *
98 * close to the root where ALL threads are allowed to split together, *
99 * ignoring this limit. Note that this is ignored in the first 1/2 of *
100 * the tree (the nodes closer to the root). There it is actually good to *
101 * split and get all active threads involved. *
102 * *
103 * smp_min_split_depth (command = smpmin=n) avoids splitting when remaining *
104 * depth < n. This is used to balance splitting overhead cost against *
105 * the speed gain the parallel search produces. The default is currently *
106 * 5 (which could change with future generations of Intel hardware) but *
107 * values between 4 and 8 will work. Larger values allow somewhat fewer *
108 * splits, which reduces overhead, but it also increases the percentage *
109 * of the time where a thread is waiting on work. *
110 * *
111 * smp_split_at_root (command = smproot=0 or 1) enables (1) or disables (0) *
112 * splitting the tree at the root. This defaults to 1 which produces the *
113 *  best performance by a significant margin.  But it can be disabled if     *
114 * you are playing with code changes. *
115 * *
116 *  smp_gratuitous_depth (command = smpgd=<n>) controls "gratuitous splits"  *
117 * which are splits that are done without any idle threads. This sets a *
118 * depth limit (remaining depth) that must be present before such a split *
119 * can be done. Making this number larger will reduce the number of *
120 * these splits. Making it too small will increase overhead slightly and *
121 * increase split block usage significantly. *
122 * *
123 * smp_gratuitous_limit (command = smpgl=<n>) limits the number of these *
124 * splits that a thread can do. Once a thread has this number of *
125 * unjoined split points, it will not be allowed to split any more until *
126 * one or more threads join at least one of the existing split points. *
127 * In the smp search statistics, where you see output that looks like *
128 * this: *
129 * *
130 * splits=m(n) ... *
131 * *
132 * m is the total splits done, n is the number of "wasted splits" which *
133 * are basically gratuitous splits where no thread joined before this *
134 * split point was completed and deallocated. *
135 * *
136 *  The best way to tune all of these parameters is to use the "autotune"    *
137 * command (see autotune.c and help autotune) which will automatically run *
138 * tests and optimize the parameters. More details are in the autotune.c *
139 * source file. *
140 * *
141 * A few basic "rules of the road" for anyone interested in changing or *
142 * adding to any of this code. *
143 * *
144 * 1. If, at any time, you want to modify your private split block, no lock *
145 * is required. *
146 * *
147 * 2. If, at any time, you want to modify another split block, such as the *
148 * parent split block shared move list, you must acquire the lock in the *
149 * split block first. IE tree->parent->lock to lock the parent split *
150 * block during NextMove() and such. *
151 * *
152 * 3. If you want to modify any SMP-related data that spans multiple split *
153 * blocks, such as telling sibling processes to stop, etc, then you must *
154 * acquire the global "lock_smp" lock first. This prevents a deadlock *
155 * caused by two different threads walking the split block chain from *
156 * different directions, and acquiring the split block locks in *
157 * different orders, which could cause a catastrophic deadlock to occur. *
158 * This is an infrequent event so the overhead is not significant. *
159 * *
160 * 4. If you want to do any sort of I/O operation, you must acquire the *
161 * "lock_io" lock first. Since threads share descriptors, there are *
162 * lots of potential race conditions, from the simple tty-output getting *
163 * interlaced from different threads, to outright data corruption in the *
164 * book or log files. *
165 * *
166 * Some of the bugs caused by failing to acquire the correct lock will only *
167 * occur infrequently, and they are extremely difficult to find. Some only *
168 * show up in a public game where everyone is watching, to cause maximum *
169 *  embarrassment and cause the program to do something extremely stupid.    *
170 * *
171 *******************************************************************************
172 */
int Split(TREE * RESTRICT tree) {
  TREE *child;
  int tid, tstart, tend;

/*
 ************************************************************
 *                                                          *
 *  Here we prepare to split the tree.  All we really do in *
 *  the Generation II threading is grab a split block for   *
 *  this thread, then flag the parent as "joinable" and     *
 *  then jump right to ThreadWait() to resume where we left *
 *  off, with the expectation (but not a requirement) that  *
 *  other threads will join us to help.                     *
 *                                                          *
 *  Idle threads are sitting in ThreadWait() repeatedly     *
 *  calling Join() to find them a split point, which we are *
 *  fixing to provide.  They will then join as quickly as   *
 *  they can, and other threads that become idle later can  *
 *  also join without any further splitting needed.         *
 *                                                          *
 *  If we are unable to allocate a split block, we simply   *
 *  abort this attempted split and return to the search     *
 *  since other threads will also split quickly.            *
 *                                                          *
 ************************************************************
 */
  tstart = ReadClock();
/* No helpers yet; joining threads will fill in siblings[] later. */
  tree->nprocs = 0;
  for (tid = 0; tid < smp_max_threads; tid++)
    tree->siblings[tid] = 0;
  child = GetBlock(tree, tree->thread_id);
  if (!child)
    return 0;                   /* no free split block, abandon the split */
  CopyFromParent(child);
  thread[tree->thread_id].tree = child;
  tree->joined = 0;
/* The instant joinable is set, idle threads may join at this split point. */
  tree->joinable = 1;
  parallel_splits++;
  smp_split = 0;                /* outstanding split request now satisfied */
  tend = ReadClock();
/* time spent splitting is accounted as idle time for this thread */
  thread[tree->thread_id].idle += tend - tstart;
/*
 ************************************************************
 *                                                          *
 *  We have successfully created a split point, which means *
 *  we are done.  The instant we set the "joinable" flag,   *
 *  idle threads may begin to join in at this split point   *
 *  to help.  Since this thread may finish before any or    *
 *  all of the other parallel threads, this thread is sent  *
 *  to ThreadWait() which will immediately send it to       *
 *  SearchMoveList() like the other threads; however, going *
 *  to ThreadWait() allows this thread to join others if it *
 *  runs out of work to do.  We do pass ThreadWait() the    *
 *  address of the parent split block, so that if this      *
 *  thread becomes idle, and this thread block shows no     *
 *  threads are still busy, then this thread can return to  *
 *  here and then back up into the previous ply as it       *
 *  should.  Note that no other thread can back up to the   *
 *  previous ply since their recursive call stacks are not  *
 *  set for that, while this call stack will bring us back  *
 *  to this point where we return to the normal search,     *
 *  which we just completed.                                *
 *                                                          *
 ************************************************************
 */
  ThreadWait(tree->thread_id, tree);
/* if nobody ever joined, this was a wasted (gratuitous) split */
  if (!tree->joined)
    parallel_splits_wasted++;
  return 1;
}
243
244 /* modified 08/03/16 */
245 /*
246 *******************************************************************************
247 * *
248 * Join() is called just when we enter the usual spin-loop waiting for work. *
249 * We take a quick look at all active split blocks to see if any look *
250 * "joinable". If so, we compute an "interest" value, which will be defined *
251 * below. We then join the most interesting split point directly. This *
252 * split point might have been created specifically for this thread to join, *
253 * or it might be one that was already active when this thread became idle, *
254 * which allows us to join that existing split point and not request a new *
255 * split operation, saving time. *
256 * *
257 *******************************************************************************
258 */
int Join(int64_t tid) {
  TREE *tree, *join_block, *child;
  int interest, best_interest, current, pass = 0;

/*
 ************************************************************
 *                                                          *
 *  First we pass over ALL split blocks, looking for those  *
 *  flagged as "joinable" (which means they are actually    *
 *  active split points and that no processor at that split *
 *  point has run out of work (there is no point in joining *
 *  a split point with no remaining work) and no fail high  *
 *  has been found which would raise the "stop" flag.) This *
 *  is "racy" because we do not acquire any locks, which    *
 *  means that the busy threads continue working, and there *
 *  is a small probability that the split point will        *
 *  disappear while we are in this loop.  To resolve the    *
 *  potential race, after we find the most attractive split *
 *  point, we acquire the lock for that split block and     *
 *  test again, but this time if the block is joinable, we  *
 *  can safely join under control of the lock, which is not *
 *  held for very long at all.  If the block is not         *
 *  joinable once we acquire the lock, we abort joining     *
 *  since it is futile.  Note that if this happens, we will *
 *  try to find an existing split point we can join three   *
 *  times before we exit, setting split to 1 to ask other   *
 *  threads to produce more candidate split points.         *
 *                                                          *
 *  Special case:  We don't want to join a split point that *
 *  was created by this thread.  While it works, it can add *
 *  overhead since we can encounter a later split point     *
 *  that originated at the current split point, and we      *
 *  would continue searching even though most of the work   *
 *  has already been completed.  The hash table would help  *
 *  avoid most (if not all) of this overhead, but there is  *
 *  no good reason to take the chance of this happening.    *
 *                                                          *
 ************************************************************
 */
  for (pass = 0; pass < 3; pass++) {
    best_interest = -999999;
    join_block = 0;
/* lock-free scan over every split block in the system */
    for (current = 0; current <= smp_max_threads * 64; current++) {
      tree = block[current];
/* near the root (ply <= depth/2) the smp_split_group limit is ignored */
      if (tree->joinable && (tree->ply <= tree->depth / 2 ||
              tree->nprocs < smp_split_group) && tree->thread_id != tid) {
/* "interest" favors deeper split points with fewer moves already searched */
        interest = tree->depth * 2 - tree->searched[0];
        if (interest > best_interest) {
          best_interest = interest;
          join_block = tree;
        }
      }
    }
/*
 ************************************************************
 *                                                          *
 *  Now we acquire the lock for this split block, and then  *
 *  check to see if the block is still flagged as joinable. *
 *  If so, we set things up, and then we get pretty tricky  *
 *  as we then release the lock, and then copy the data     *
 *  from the parent to our split block.  There is a chance  *
 *  that while we are copying this data, the split point    *
 *  gets completed by other threads.  Which would leave an  *
 *  apparent race condition exposed where we start copying  *
 *  data here, the split point is completed, the parent     *
 *  block is released and then reacquired and we continue   *
 *  if nothing has happened here, getting data copied from  *
 *  two different positions.                                *
 *                                                          *
 *  Fortunately, we linked this new split block to the old  *
 *  (original parent).  If that split block is released, we *
 *  will discover this because that operation will also set *
 *  our "stop" flag which will prevent us from using this   *
 *  data and breaking things.  We allow threads to copy     *
 *  this data without any lock protection to eliminate a    *
 *  serialization (each node would copy the data serially,  *
 *  rather than all at once) with the only consequence to   *
 *  this being the overhead of copying and then throwing    *
 *  the data away, which can happen on occasion even if we  *
 *  used a lock for protection, since once we release the   *
 *  lock it still takes time to get into the search and we  *
 *  could STILL find that this split block has already been *
 *  completed, once again.  Less contention and serial      *
 *  computing improves performance.                         *
 *                                                          *
 ************************************************************
 */
    if (join_block) {
      Lock(join_block->lock);
      if (join_block->joinable) {
        child = GetBlock(join_block, tid);
        Unlock(join_block->lock);
        if (child) {
/* copy done WITHOUT the lock; the stop flag protects us if the split dies */
          CopyFromParent(child);
          thread[tid].tree = child;
          parallel_joins++;
          return 1;
        }
      } else {
/* candidate evaporated between scan and lock; futile to keep trying */
        Unlock(join_block->lock);
        break;
      }
    }
  }
/*
 ************************************************************
 *                                                          *
 *  We did not acquire a split point to join, so we set     *
 *  smp_split to 1 to ask busy threads to create joinable   *
 *  split points.                                           *
 *                                                          *
 ************************************************************
 */
  smp_split = 1;
  return 0;
}
375
376 /* modified 08/03/16 */
377 /*
378 *******************************************************************************
379 * *
380 * ThreadAffinity() is called to "pin" a thread to a specific processor. It *
381 * is a "noop" (no-operation) if Crafty was not compiled with -DAFFINITY, or *
382 * if smp_affinity is negative (smpaffinity=-1 disables affinity). It *
383 * simply sets the affinity for the current thread to the requested CPU and *
384 * returns. NOTE: If hyperthreading is enabled, there is no guarantee that *
385 * this will work as expected and pin one thread per physical core. It *
386 * depends on how the O/S numbers the SMT cores. *
387 * *
388 *******************************************************************************
389 */
/*
 *  Pin the calling thread to one CPU.  Compiled to a no-op unless Crafty
 *  was built with -DAFFINITY; also a no-op when smp_affinity < 0
 *  (smpaffinity=-1 disables pinning).  The target CPU is derived from the
 *  thread number plus the configured base offset.
 */
void ThreadAffinity(int cpu) {
#if defined(AFFINITY)
  cpu_set_t mask;

  if (smp_affinity >= 0) {
    CPU_ZERO(&mask);
    CPU_SET(smp_affinity_increment * (cpu + smp_affinity), &mask);
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask);
  }
#endif
}
402
403 /* modified 08/03/16 */
404 /*
405 *******************************************************************************
406 * *
407 * ThreadInit() is called after a process is created. Its main task is to *
408 * initialize the process local memory so that it will fault in and be *
409 * allocated on the local node rather than the node where the original *
410 * (first) process was running. All threads will hang here via a custom *
411 *  WaitForAllThreadsInitialized() procedure so that all the local thread    *
412 * blocks are usable before the search actually begins. *
413 * *
414 *******************************************************************************
415 */
ThreadInit(void * t)416 void *STDCALL ThreadInit(void *t) {
417 int tid = (int64_t) t;
418
419 ThreadAffinity(tid);
420 #if !defined(UNIX)
421 ThreadMalloc((uint64_t) tid);
422 #endif
423 thread[tid].blocks = 0xffffffffffffffffull;
424 Lock(lock_smp);
425 initialized_threads++;
426 Unlock(lock_smp);
427 WaitForAllThreadsInitialized();
428 ThreadWait(tid, (TREE *) 0);
429 Lock(lock_smp);
430 smp_threads--;
431 Unlock(lock_smp);
432 return 0;
433 }
434
435 /* modified 08/03/16 */
436 /*
437 *******************************************************************************
438 * *
439 * ThreadSplit() is used to determine if we should split at the current ply. *
440 * There are some basic constraints on when splits can be done, such as the *
441 * depth remaining in the search (don't split to near the tips), and have we *
442 * searched at least one move to get a score or bound (YBW condition). *
443 * *
444 * If those conditions are satisfied, AND either a thread has requested a *
445 * split OR we are far enough away from the tips of the tree to justify a *
446 *  "gratuitous split" then we return "success."  A "gratuitous split" is a  *
447 * split done without any idle threads. Since splits are not free, we only *
448 * do this well away from tips to limit overhead. We do this so that when a *
449 * thread becomes idle, it will find these split points immediately and not *
450 * have to wait for a split after the fact. *
451 * *
452 *******************************************************************************
453 */
ThreadSplit(TREE * tree,int ply,int depth,int alpha,int o_alpha,int done)454 int ThreadSplit(TREE * tree, int ply, int depth, int alpha, int o_alpha,
455 int done) {
456 TREE *used;
457 int64_t tblocks;
458 int temp, unused = 0;
459
460 /*
461 ************************************************************
462 * *
463 * First, we see if we meet the basic criteria to create a *
464 * split point, that being that we must not be too far *
465 * from the root (smp_min_split_depth). *
466 * *
467 ************************************************************
468 */
469 if (depth < smp_min_split_depth)
470 return 0;
471 /*
472 ************************************************************
473 * *
474 * If smp_split is NOT set, we are checking to see if it *
475 * is acceptable to do a gratuitous split here. *
476 * *
477 * (1) if we are too far from the root we do not do *
478 * gratuitous splits to avoid the overhead. *
479 * *
480 * (2) if we have searched more than one move at this ply, *
481 * we don't do any further tests to see if a *
482 * gratuitous split is acceptable, since we have *
483 * previously done this test at this ply and decided *
484 * one should not be done. That condition has likely *
485 * not changed. *
486 * *
487 * (3) if we have pre-existing gratuitous split points for *
488 * this thread, we make certain we don't create more *
489 * than the gratuitous split limit as excessive splits *
490 * just add to the overhead with no benefit. *
491 * *
492 ************************************************************
493 */
494 if (!smp_split) {
495 if (depth < smp_gratuitous_depth || done > 1)
496 return 0;
497 tblocks = ~thread[tree->thread_id].blocks;
498 while (tblocks) {
499 temp = LSB(tblocks);
500 used = block[temp + tree->thread_id * 64 + 1];
501 if (used->joinable && !used->joined)
502 unused++;
503 Clear(temp, tblocks);
504 }
505 if (unused > smp_gratuitous_limit)
506 return 0;
507 }
508 /*
509 ************************************************************
510 * *
511 * If smp_split IS set, we are checking to see if it is *
512 * acceptable to do a split because there are idle threads *
513 * that need work to do. *
514 * *
515 * The only reason this would be false is if we have a *
516 * pre-existing split point that is joinable but has not *
517 * been joined. If one exists, there is no need to split *
518 * again as there is already an accessible split point. *
519 * Otherwise, if we are at the root and we are either not *
520 * allowed to split at the root, or we have additional *
521 * root moves that have to be searched one at a time using *
522 * all available threads we also can not split here. *
523 * *
524 ************************************************************
525 */
526 else {
527 if (ply == 1 && (!smp_split_at_root || !NextRootMoveParallel() ||
528 alpha == o_alpha))
529 return 0;
530 tblocks = ~thread[tree->thread_id].blocks;
531 while (tblocks) {
532 temp = LSB(tblocks);
533 used = block[temp + tree->thread_id * 64 + 1];
534 if (used->joinable && !used->joined)
535 unused++;
536 Clear(temp, tblocks);
537 }
538 if (unused > smp_gratuitous_limit)
539 return 0;
540 }
541 return 1;
542 }
543
544 /* modified 08/03/16 */
545 /*
546 *******************************************************************************
547 * *
548 * ThreadStop() is called from SearchMoveList() when it detects a beta *
549 * cutoff (fail high) at a node that is being searched in parallel. We need *
550 * to stop all threads here, and since this threading algorithm is recursive *
551 * it may be necessary to stop other threads that are helping search this *
552 * branch further down into the tree. This function simply sets appropriate *
553 * tree->stop variables to 1, which will stop those particular threads *
554 * instantly and return them to the idle loop in ThreadWait(). *
555 * *
556 *******************************************************************************
557 */
/*
 *  Recursively raise the stop flag on this split block and on every
 *  sibling split block helping below it, so all threads searching this
 *  branch abandon it and return to the idle loop in ThreadWait().  The
 *  block is also marked un-joinable so no new thread attaches to a
 *  split point that is being torn down.  The split block's own lock is
 *  held while its flags and sibling list are examined.
 */
void ThreadStop(TREE * RESTRICT tree) {
  int tid;

  Lock(tree->lock);
  tree->stop = 1;
  tree->joinable = 0;
  for (tid = 0; tid < smp_max_threads; tid++) {
    if (tree->siblings[tid])
      ThreadStop(tree->siblings[tid]);
  }
  Unlock(tree->lock);
}
569
570 /* modified 08/03/16 */
571 /*
572 *******************************************************************************
573 * *
574 * ThreadTrace() is a debugging tool that simply walks the split block tree *
575 * and displays interesting data to help debug the parallel search whenever *
576 * changes break things. *
577 * *
578 *******************************************************************************
579 */
void ThreadTrace(TREE * RESTRICT tree, int depth, int brief) {
  int proc, i;

/* NOTE(review): tree->lock is held across the recursive calls below, so
   locks are acquired parent-before-child; presumably safe because walkers
   traverse top-down only -- confirm against rule 3 in the file header */
  Lock(tree->lock);
  Lock(lock_io);
  if (!brief) {
/* full dump: indent 4 columns per level, then show this block's state */
    for (i = 0; i < 4 * depth; i++)
      Print(4095, " ");
    depth++;
    Print(4095, "block[%d]  thread=%d  ply=%d  nprocs=%d  ",
        FindBlockID(tree), tree->thread_id, tree->ply, tree->nprocs);
    Print(4095, "joined=%d  joinable=%d  stop=%d  nodes=%d", tree->joined,
        tree->joinable, tree->stop, tree->nodes_searched);
    Print(4095, "  parent=%d\n", FindBlockID(tree->parent));
  } else {
/* brief dump: only show split points where threads are actually helping */
    if (tree->nprocs > 1) {
      for (i = 0; i < 4 * depth; i++)
        Print(4095, " ");
      depth++;
      Print(4095, "(ply %d)", tree->ply);
    }
  }
  if (tree->nprocs) {
    if (!brief) {
      for (i = 0; i < 4 * depth; i++)
        Print(4095, " ");
      Print(4095, "          parent=%d  sibling threads=",
          FindBlockID(tree->parent));
      for (proc = 0; proc < smp_max_threads; proc++)
        if (tree->siblings[proc])
          Print(4095, " %d(%d)", proc, FindBlockID(tree->siblings[proc]));
      Print(4095, "\n");
    } else {
      if (tree->nprocs > 1) {
        Print(4095, " helping= ");
/* the owning thread's id is shown in [brackets] */
        for (proc = 0; proc < smp_max_threads; proc++)
          if (tree->siblings[proc]) {
            if (proc == tree->thread_id)
              Print(4095, "[");
            Print(4095, "%d", proc);
            if (proc == tree->thread_id)
              Print(4095, "]");
            Print(4095, " ");
          }
        Print(4095, "\n");
      }
    }
  }
/* release lock_io before recursing so helpers can interleave their I/O */
  Unlock(lock_io);
  for (proc = 0; proc < smp_max_threads; proc++)
    if (tree->siblings[proc])
      ThreadTrace(tree->siblings[proc], depth, brief);
  Unlock(tree->lock);
}
634
635 /* modified 08/03/16 */
636 /*
637 *******************************************************************************
638 * *
639 * ThreadWait() is the idle loop for the N threads that are created at the *
640 * beginning when Crafty searches. Threads are "parked" here waiting on a *
641 * pointer to something they should search (a parameter block built in the *
642 * function Split() in this case. When this pointer becomes non-zero, each *
643 * thread "parked" here will immediately call SearchMoveList() and begin the *
644 * parallel search as directed. *
645 * *
646 * Upon entry, all threads except for the "master" will arrive here with a *
647 * value of zero (0) in the waiting parameter below. This indicates that *
648 * they will search and them be done. The "master" will arrive here with a *
649 * pointer to the parent split block in "waiting" which says I will sit here *
650 * waiting for work OR when the waiting split block has no threads working *
651 * on it, at which point I should return which will let me "unsplit" here *
652 * and clean things up. The call to here in Split() passes this block *
653 * address while threads that are helping get here with a zero. *
654 * *
655 *******************************************************************************
656 */
int ThreadWait(int tid, TREE * RESTRICT waiting) {
  int value, tstart, tend;

/*
 ************************************************************
 *                                                          *
 *  When we reach this point, one of three possible         *
 *  conditions is true (1) we already have work to do, as   *
 *  we are the "master thread" and we have already split    *
 *  the tree, we are coming here to join in; (2) we are     *
 *  the master, and we are waiting on our split point to    *
 *  complete, so we come here to join and help currently    *
 *  active threads; (3) we have no work to do, so we will   *
 *  spin until Join() locates a split point we can join to  *
 *  help out.                                               *
 *                                                          *
 *  Note that when we get here, the parent already has a    *
 *  split block and does not need to call Join(), it simply *
 *  falls through the while spin loop below because its     *
 *  "tree" pointer is already non-zero.                     *
 *                                                          *
 ************************************************************
 */
  while (FOREVER) {
    tstart = ReadClock();
/* spin until: we have work, our own split point completes (nprocs hits
   zero), Join() attaches us to a split point, or terminate is raised */
    while (!thread[tid].tree && (!waiting || waiting->nprocs) && !Join(tid) &&
        !thread[tid].terminate);
    tend = ReadClock();
    if (!thread[tid].tree)
      thread[tid].tree = waiting;
    thread[tid].idle += tend - tstart;
/* master returns here (tree == waiting) to "unsplit" and back up a ply */
    if (thread[tid].tree == waiting || thread[tid].terminate)
      return 0;
/*
 ************************************************************
 *                                                          *
 *  Once we get here, we have a good split point, so we are *
 *  ready to participate in a parallel search.  Once we     *
 *  return from SearchMoveList() we copy our results back   *
 *  to the parent via CopyToParent() before we look for a   *
 *  new split point.  If we are a parent, we will slip out  *
 *  of the spin loop at the top and return to the normal    *
 *  serial search to finish up here.                        *
 *                                                          *
 *  When we return from SearchMoveList(), we need to        *
 *  decrement the "nprocs" value since there is now one     *
 *  less thread working at this split point.                *
 *                                                          *
 *  Note:  CopyToParent() marks the current split block as  *
 *  unused once the copy is completed, so we don't have to  *
 *  do anything about that here.                            *
 *                                                          *
 ************************************************************
 */
    value =
        SearchMoveList(thread[tid].tree, thread[tid].tree->ply,
        thread[tid].tree->depth, thread[tid].tree->wtm,
        thread[tid].tree->alpha, thread[tid].tree->beta,
        thread[tid].tree->searched, thread[tid].tree->in_check, 0, parallel);
    tstart = ReadClock();
/* detach from the parent under its lock; once a thread exits the split
   point it is no longer joinable (no point joining exhausted work) */
    Lock(thread[tid].tree->parent->lock);
    thread[tid].tree->parent->joinable = 0;
    CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
    thread[tid].tree->parent->nprocs--;
    thread[tid].tree->parent->siblings[tid] = 0;
    Unlock(thread[tid].tree->parent->lock);
    thread[tid].tree = 0;
    tend = ReadClock();
    thread[tid].idle += tend - tstart;
  }
}
728
729 /* modified 08/03/16 */
730 /*
731 *******************************************************************************
732 * *
733 * CopyFromParent() is used to copy data from a parent thread to a child *
734 * thread. This only copies the appropriate parts of the TREE structure to *
735 * avoid burning memory bandwidth by copying everything. *
736 * *
737 *******************************************************************************
738 */
void CopyFromParent(TREE * RESTRICT child) {
  TREE *p = child->parent;
  int n, ply = p->ply;

/*
 ************************************************************
 *                                                          *
 *  A split block has been allocated for this thread.  Copy *
 *  the tree search state from the parent split block into  *
 *  the child so the parallel search can start from the     *
 *  identical position.  Only the parts of TREE that the    *
 *  search actually needs are copied, to save memory        *
 *  bandwidth.                                              *
 *                                                          *
 ************************************************************
 */
  child->ply = ply;
  child->position = p->position;
/* repetition list up through the current ply */
  for (n = 0; n <= rep_index + p->ply; n++)
    child->rep_list[n] = p->rep_list[n];
/* killer moves from the previous ply onward */
  for (n = ply - 1; n < MAXPLY; n++)
    child->killers[n] = p->killers[n];
/* full counter-move / move-pair history tables */
  for (n = 0; n < 4096; n++) {
    child->counter_move[n] = p->counter_move[n];
    child->move_pair[n] = p->move_pair[n];
  }
/* current move and PV for the previous and current ply */
  child->curmv[ply - 1] = p->curmv[ply - 1];
  child->pv[ply - 1] = p->pv[ply - 1];
  child->curmv[ply] = p->curmv[ply];
  child->pv[ply] = p->pv[ply];
  child->in_check = p->in_check;
  child->last[ply] = child->move_list;
  child->status[ply] = p->status[ply];
  child->status[1] = p->status[1];
  child->save_hash_key[ply] = p->save_hash_key[ply];
  child->save_pawn_hash_key[ply] = p->save_pawn_hash_key[ply];
/* statistics start at zero in the child; they are summed back into
   the parent by CopyToParent() when this thread finishes */
  child->nodes_searched = 0;
  child->fail_highs = 0;
  child->fail_high_first_move = 0;
  child->evaluations = 0;
  child->egtb_probes = 0;
  child->egtb_hits = 0;
  child->extensions_done = 0;
  child->qchecks_done = 0;
  child->moves_fpruned = 0;
  child->moves_mpruned = 0;
  for (n = 0; n < 16; n++) {
    child->LMR_done[n] = 0;
    child->null_done[n] = 0;
  }
/* search bounds and bookkeeping for this split point */
  child->alpha = p->alpha;
  child->beta = p->beta;
  child->value = p->value;
  child->wtm = p->wtm;
  child->depth = p->depth;
  child->searched = p->searched;
  strcpy(child->root_move_text, p->root_move_text);
  strcpy(child->remaining_moves_text, p->remaining_moves_text);
}
796
797 /* modified 08/03/16 */
798 /*
799 *******************************************************************************
800 * *
801 * CopyToParent() is used to copy data from a child thread to a parent *
802 * thread. This only copies the appropriate parts of the TREE structure to *
803 * avoid burning memory bandwidth by copying everything. *
804 * *
805 *******************************************************************************
806 */
void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) {
  int n, slot, ply = parent->ply;

/*
 ************************************************************
 *                                                          *
 *  The only concern here is to make sure that the info is  *
 *  only copied to the parent if our score is > than the    *
 *  parent value, and that we were not stopped for any      *
 *  reason which could produce a partial score that is      *
 *  worthless and dangerous to use.                         *
 *                                                          *
 *  One important special case.  If we get here with the    *
 *  thread->stop flag set, and ply is 1, then we need to    *
 *  clear the "this move has been searched" flag in the ply *
 *  1 move list since we did not complete the search.  If   *
 *  we fail to do this, then a move being searched in       *
 *  parallel at the root will be "lost" for this iteration  *
 *  and won't be searched again until the next iteration.   *
 *                                                          *
 *  In any case, we add our statistical counters to the     *
 *  parent's totals no matter whether we finished or not    *
 *  since the total nodes searched and such should consider *
 *  everything searched, not just the "useful stuff."       *
 *                                                          *
 *  After we finish copying everything, we mark this split  *
 *  block as free in the split block bitmap.                *
 *                                                          *
 ************************************************************
 */
  if (!abort_search && !child->stop && child->nodes_searched &&
      value > parent->value) {
    parent->pv[ply] = child->pv[ply];
    parent->value = value;
    parent->cutmove = child->curmv[ply];
    for (n = 0; n < 4096; n++) {
      parent->counter_move[n] = child->counter_move[n];
      parent->move_pair[n] = child->move_pair[n];
    }
  }
/* incomplete root-mode search: clear the "searched" status bits for
   this root move so it gets searched again this iteration */
  if (child->stop && ply == 1) {
    for (n = 0; n < n_root_moves; n++) {
      if (root_moves[n].move == child->curmv[ply]) {
        root_moves[n].status &= 7;
        break;
      }
    }
  }
/* statistics are always accumulated, finished or not */
  parent->nodes_searched += child->nodes_searched;
  parent->fail_highs += child->fail_highs;
  parent->fail_high_first_move += child->fail_high_first_move;
  parent->evaluations += child->evaluations;
  parent->egtb_probes += child->egtb_probes;
  parent->egtb_hits += child->egtb_hits;
  parent->extensions_done += child->extensions_done;
  parent->qchecks_done += child->qchecks_done;
  parent->moves_fpruned += child->moves_fpruned;
  parent->moves_mpruned += child->moves_mpruned;
  for (n = 1; n < 16; n++) {
    parent->LMR_done[n] += child->LMR_done[n];
    parent->null_done[n] += child->null_done[n];
  }
/* release the child's split block back to its owner's free bitmap */
  slot = FindBlockID(child) - 64 * child->thread_id - 1;
  Set(slot, thread[child->thread_id].blocks);
}
870
871 /* modified 08/03/16 */
872 /*
873 *******************************************************************************
874 * *
875 * GetBlock() is used to allocate a split block and fill in only SMP- *
876 * critical information. The child process will copy the rest of the split *
877 * block information as needed. *
878 * *
879 * When we arrive here, the parent split block must be locked since we are *
880 * going to change data in that block as well as copy data from that block *
* to the current split block.  The only exception is when this is the original *
882 * split operation, since this is done "out of sight" of other threads which *
883 * means no locks are needed until after the "joinable" flag is set, which *
884 * exposes this split point to other threads instantly. *
885 * *
886 *******************************************************************************
887 */
TREE *GetBlock(TREE * RESTRICT parent, int tid) {
  static int warnings = 0;
  TREE *child;
  int n, slot;

/*
 ************************************************************
 *                                                          *
 *  One NUMA-related trick is that we only allocate a split *
 *  block in the thread's local memory.  Each thread has a  *
 *  group of split blocks that were first touched by the    *
 *  correct CPU so that the split blocks page-faulted into  *
 *  local memory for that specific processor.  If we can't  *
 *  find an optimally-placed block, we return a zero which  *
 *  will prevent this thread from joining the split point.  *
 *  This would mean the current thread has 64 active split  *
 *  blocks already, which is extremely unlikely.            *
 *                                                          *
 *  Here we use a simple 64 bit bit-map per thread that     *
 *  indicates which blocks are free (1) and which blocks    *
 *  are used (0).  We simply use LSB() to find the          *
 *  rightmost one-bit set and that is the local block       *
 *  number.  We convert that to a global block number with  *
 *  the simple computation:                                 *
 *                                                          *
 *     global = local + 64 * tid + 1                        *
 *                                                          *
 *  Each thread has exactly 64 split blocks, and block 0    *
 *  is the "master block" that never gets allocated or      *
 *  freed.  Once we find a free block for the current       *
 *  thread, we zero that bit so that the block won't be     *
 *  used again until it is released.                        *
 *                                                          *
 ************************************************************
 */
  if (!thread[tid].blocks) {
    if (++warnings < 6)
      Print(2048,
          "WARNING. local SMP block cannot be allocated, thread %d\n", tid);
    return 0;
  }
  slot = LSB(thread[tid].blocks);
  Clear(slot, thread[tid].blocks);
  Set(slot, thread[tid].max_blocks);
  child = block[slot + tid * 64 + 1];
/*
 ************************************************************
 *                                                          *
 *  Found a split block.  Now we need to fill in only the   *
 *  critical information that can't be delayed due to race  *
 *  conditions.  When we get here, the parent split block   *
 *  must be locked, that lets us safely update the number   *
 *  of processors working here, etc, without any ugly race  *
 *  conditions that would corrupt this critical data.       *
 *                                                          *
 ************************************************************
 */
  for (n = 0; n < smp_max_threads; n++)
    child->siblings[n] = 0;
  child->nprocs = 0;
  child->stop = 0;
  child->joinable = 0;
  child->joined = 0;
  child->parent = parent;
  child->thread_id = tid;
/* link the new block under the (locked) parent split point */
  parent->nprocs++;
  parent->siblings[tid] = child;
  parent->joined = 1;
  return child;
}
960
961 /*
962 *******************************************************************************
963 * *
964 * WaitForAllThreadsInitialized() waits until all smp_max_threads are *
965 * initialized. We have to initialize each thread and malloc() its split *
966 * blocks before we start the actual parallel search. Otherwise we will see *
967 * invalid memory accesses and crash instantly. *
968 * *
969 *******************************************************************************
970 */
void WaitForAllThreadsInitialized(void) {
/* busy-wait until every thread has bumped initialized_threads; each
   thread malloc's its own split blocks during init, so searching
   before this completes would touch unallocated memory.  NOTE(review):
   this spin relies on the compiler re-reading initialized_threads each
   iteration — confirm it is declared volatile/atomic in data.h. */
  while (initialized_threads < smp_max_threads); /* Do nothing */
}
974
975 #if !defined (UNIX)
976 /* modified 08/03/16 */
977 /*
978 *******************************************************************************
979 * *
980 * ThreadMalloc() is called from the ThreadInit() function. It malloc's the *
981 * split blocks in the local memory for the processor associated with the *
982 * specific thread that is calling this code. *
983 * *
984 *******************************************************************************
985 */
986 extern void *WinMalloc(size_t, int);
ThreadMalloc(int64_t tid)987 void ThreadMalloc(int64_t tid) {
988 int i;
989
990 for (i = tid * 64 + 1; i < tid * 64 + 65; i++) {
991 if (block[i] == NULL)
992 block[i] =
993 (TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) +
994 127, tid)));
995 block[i]->parent = NULL;
996 LockInit(block[i]->lock);
997 }
998 }
999 #endif
1000