1 
2 /*--------------------------------------------------------------------*/
3 /*--- LibHB: a library for implementing and checking               ---*/
4 /*--- the happens-before relationship in concurrent programs.      ---*/
5 /*---                                                 libhb_main.c ---*/
6 /*--------------------------------------------------------------------*/
7 
8 /*
9    This file is part of LibHB, a library for implementing and checking
10    the happens-before relationship in concurrent programs.
11 
12    Copyright (C) 2008-2017 OpenWorks Ltd
13       info@open-works.co.uk
14 
15    This program is free software; you can redistribute it and/or
16    modify it under the terms of the GNU General Public License as
17    published by the Free Software Foundation; either version 2 of the
18    License, or (at your option) any later version.
19 
20    This program is distributed in the hope that it will be useful, but
21    WITHOUT ANY WARRANTY; without even the implied warranty of
22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23    General Public License for more details.
24 
25    You should have received a copy of the GNU General Public License
26    along with this program; if not, write to the Free Software
27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28    02111-1307, USA.
29 
30    The GNU General Public License is contained in the file COPYING.
31 */
32 
33 #include "pub_tool_basics.h"
34 #include "pub_tool_poolalloc.h"
35 #include "pub_tool_libcassert.h"
36 #include "pub_tool_libcbase.h"
37 #include "pub_tool_libcprint.h"
38 #include "pub_tool_machine.h"
39 #include "pub_tool_mallocfree.h"
40 #include "pub_tool_wordfm.h"
41 #include "pub_tool_hashtable.h"
42 #include "pub_tool_xarray.h"
43 #include "pub_tool_oset.h"
44 #include "pub_tool_threadstate.h"
45 #include "pub_tool_aspacemgr.h"
46 #include "pub_tool_stacktrace.h"
47 #include "pub_tool_execontext.h"
48 #include "pub_tool_errormgr.h"
49 #include "pub_tool_debuginfo.h"
50 #include "pub_tool_gdbserver.h"
51 #include "pub_tool_options.h"        // VG_(clo_stats)
52 #include "hg_basics.h"
53 #include "hg_wordset.h"
54 #include "hg_lock_n_thread.h"
55 #include "hg_errors.h"
56 
57 #include "libhb.h"
58 
59 
60 /////////////////////////////////////////////////////////////////
61 /////////////////////////////////////////////////////////////////
62 //                                                             //
63 // Debugging #defines                                          //
64 //                                                             //
65 /////////////////////////////////////////////////////////////////
66 /////////////////////////////////////////////////////////////////
67 
68 /* Check the sanity of shadow values in the core memory state
69    machine.  Change #if 0 to #if 1 to enable this. */
70 #if 0
71 #  define CHECK_MSM 1
72 #else
73 #  define CHECK_MSM 0
74 #endif
75 
76 
77 /* Check sanity (reference counts, etc) in the conflicting access
78    machinery.  Change #if 0 to #if 1 to enable this. */
79 #if 0
80 #  define CHECK_CEM 1
81 #else
82 #  define CHECK_CEM 0
83 #endif
84 
85 
86 /* Check sanity in the compressed shadow memory machinery,
87    particularly in its caching innards.  Unfortunately there's no
88    almost-zero-cost way to make them selectable at run time.  Hence
89    set the #if 0 to #if 1 and rebuild if you want them. */
90 #if 0
91 #  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
92 #  define inline __attribute__((noinline))
93    /* probably want to ditch -fomit-frame-pointer too */
94 #else
95 #  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
96 #endif
97 
98 /* Define to 1 to activate tracing cached rcec. */
99 #define DEBUG_CACHED_RCEC 0
100 
101 /////////////////////////////////////////////////////////////////
102 /////////////////////////////////////////////////////////////////
103 //                                                             //
104 // data decls: VtsID                                           //
105 //                                                             //
106 /////////////////////////////////////////////////////////////////
107 /////////////////////////////////////////////////////////////////
108 
109 /* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
110    bits, since they have to be packed into the lowest 30 bits of an
111    SVal. */
112 typedef  UInt  VtsID;
113 #define VtsID_INVALID 0xFFFFFFFF
114 
115 
116 
117 /////////////////////////////////////////////////////////////////
118 /////////////////////////////////////////////////////////////////
119 //                                                             //
120 // data decls: SVal                                            //
121 //                                                             //
122 /////////////////////////////////////////////////////////////////
123 /////////////////////////////////////////////////////////////////
124 
125 typedef  ULong  SVal;
126 
127 /* This value has special significance to the implementation, and callers
128    may not store it in the shadow memory. */
129 #define SVal_INVALID (3ULL << 62)
130 
131 /* This is the default value for shadow memory.  Initially the shadow
132    memory contains no accessible areas and so all reads produce this
133    value.  TODO: make this caller-definable. */
134 #define SVal_NOACCESS (2ULL << 62)
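
/* Illustrative sketch only (not part of libhb proper): the constants above
   imply that the top two bits of an SVal act as a tag, leaving room for two
   <= 30-bit VtsIDs in the remaining bits.  The authoritative encoding is
   given by SVal__mkC, SVal__unC_Rmin, SVal__unC_Wmin and SVal__isC later in
   this file; the tag value and field positions below are assumptions made
   purely for illustration. */
#if 0
#define EXAMPLE_SVAL_TAG_C  (1ULL << 62)  /* assumed tag for a "constrained" SVal */
static inline SVal example_SVal_mkC ( VtsID rmini, VtsID wmini ) {
   /* both VtsIDs must fit in 30 bits, as the comment on VtsID requires */
   tl_assert(rmini < (1u << 30) && wmini < (1u << 30));
   return EXAMPLE_SVAL_TAG_C | ((ULong)rmini << 30) | (ULong)wmini;
}
static inline VtsID example_SVal_unC_Wmin ( SVal s ) {
   return (VtsID)(s & ((1ULL << 30) - 1));
}
#endif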
135 
136 
137 
138 /////////////////////////////////////////////////////////////////
139 /////////////////////////////////////////////////////////////////
140 //                                                             //
141 // data decls: ScalarTS                                        //
142 //                                                             //
143 /////////////////////////////////////////////////////////////////
144 /////////////////////////////////////////////////////////////////
145 
146 /* Scalar Timestamp.  We have to store a lot of these, so there is
147    some effort to make them as small as possible.  Logically they are
148    a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
149    We pack it into 64 bits by representing the Thr* using a ThrID, a
150    small integer (18 bits), and a 46 bit integer for the timestamp
151    number.  The 46/18 split is arbitrary, but has the effect that
152    Helgrind can only handle programs that create 2^18 or fewer threads
153    over their entire lifetime, and have no more than 2^46 timestamp
154    ticks (synchronisation operations on the same thread).
155 
156    This doesn't seem like much of a limitation.  2^46 ticks is
157    7.04e+13, and if each tick (optimistically) takes the machine 1000
158    cycles to process, then the minimum time to process that many ticks
159    at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
160    but VTS ticks, which isn't realistic.
161 
162    NB1: SCALARTS_N_THRBITS must be 27 or lower.  The obvious limit is
163    32 since a ThrID is a UInt.  27 comes from the fact that
164    'Thr_n_RCEC', which records information about old accesses, packs
165    into its tsw field not only a ThrID but also at least 4+1 other bits
166    (access size and writeness) in a UInt, limiting it to 32-(4+1) == 27.
167 
168    NB2: thrid values are issued upwards from 1024, and values less
169    than that aren't valid.  This isn't per se necessary (any order
170    will do, so long as they are unique), but it does help ensure they
171    are less likely to get confused with the various other kinds of
172    small-integer thread ids drifting around (eg, TId).
173    So, SCALARTS_N_THRBITS must be 11 or more.
174    See also NB5.
175 
176    NB3: this probably also relies on the fact that Thr's are never
177    deallocated -- they exist forever.  Hence the 1-1 mapping from
178    Thr's to thrid values (set up in Thr__new) persists forever.
179 
180    NB4: temp_max_sized_VTS is allocated at startup and never freed.
181    It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
182    ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
183    making the memory use for this go sky-high.  With
184    SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
185    like an OK tradeoff.  If more than 256k threads need to be
186    supported, we could change SCALARTS_N_THRBITS to 20, which would
187    facilitate supporting 1 million threads at the cost of 8MB storage
188    for temp_max_sized_VTS.
189 
190    NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
191    ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
192    must never be a valid ThrID.  Given NB2 that's OK.
193 */
194 #define SCALARTS_N_THRBITS 18  /* valid range: 11 to 27 inclusive,
195                                   See NB1 and NB2 above. */
196 
197 #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
198 typedef
199    struct {
200       ThrID thrid : SCALARTS_N_THRBITS;
201       ULong tym   : SCALARTS_N_TYMBITS;
202    }
203    ScalarTS;
204 
205 #define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
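
/* Illustrative sketch only: the bitfields above squeeze a (thrid, tym) pair
   into a single 64-bit word.  'example_' names below are not part of libhb;
   the sizeof check assumes the bitfield packing used by the compilers
   Valgrind targets. */
#if 0
static void example_ScalarTS_packing ( void )
{
   ScalarTS st;
   tl_assert(sizeof(ScalarTS) == 8);   /* 18 + 46 bits fit in one 64-bit word */
   st.thrid = 1024;                    /* thrids are issued from 1024 upwards (NB2) */
   st.tym   = 1;                       /* first tick of this thread's clock */
   tl_assert(st.thrid >= 1024 && st.thrid <= ThrID_MAX_VALID);
}
#endif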
206 
207 
208 
209 /////////////////////////////////////////////////////////////////
210 /////////////////////////////////////////////////////////////////
211 //                                                             //
212 // data decls: Filter                                          //
213 //                                                             //
214 /////////////////////////////////////////////////////////////////
215 /////////////////////////////////////////////////////////////////
216 
217 // baseline: 5, 9
218 #define FI_LINE_SZB_LOG2  5
219 #define FI_NUM_LINES_LOG2 10
220 
221 #define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
222 #define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)
223 
224 #define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
225 #define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)
226 
227 #define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
228                              & (Addr)(FI_NUM_LINES-1) )
229 
230 
231 /* In the lines, each 8 bytes are treated individually, and are mapped
232    to a UShort.  Regardless of endianness of the underlying machine,
233    bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
234    the highest address.
235 
236    Of each bit pair, the higher numbered bit is set if an R has been
237    seen and the lower bit if a W has been seen, so the actual layout is:
238 
239    15 14             ...  01 00
240 
241    R  W  for addr+7  ...  R  W  for addr+0
242 
243    So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
244 */
245 
246 /* tags are separated from lines.  tags are Addrs and are
247    the base address of the line. */
248 typedef
249    struct {
250       UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
251    }
252    FiLine;
253 
254 typedef
255    struct {
256       Addr   tags[FI_NUM_LINES];
257       FiLine lines[FI_NUM_LINES];
258    }
259    Filter;
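
/* Illustrative sketch only: how an address is located in the Filter just
   defined, following the bit layout described above (W in the even bit, R in
   the odd bit of each pair).  The real filter lookup/update code appears
   later in this file; 'example_' names are not part of libhb. */
#if 0
static UWord example_filter_lookup ( Filter* fi, Addr a )
{
   UWord  lineno = FI_GET_LINENO(a);          /* which of the FI_NUM_LINES lines */
   Addr   atag   = FI_GET_TAG(a);             /* base address covered by that line */
   UWord  loff   = (a - atag) / 8;            /* which UShort within the line */
   UWord  byteno = a & 7;                     /* which byte within that 8-byte group */
   UShort u16    = fi->lines[lineno].u16s[loff];
   UWord  Wseen  = (u16 >> (2*byteno))     & 1;
   UWord  Rseen  = (u16 >> (2*byteno + 1)) & 1;
   /* meaningful only if the line currently holds 'a' */
   return (fi->tags[lineno] == atag) ? ((Rseen << 1) | Wseen) : 0;
}
#endif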
260 
261 
262 
263 /////////////////////////////////////////////////////////////////
264 /////////////////////////////////////////////////////////////////
265 //                                                             //
266 // data decls: Thr, ULong_n_EC                                 //
267 //                                                             //
268 /////////////////////////////////////////////////////////////////
269 /////////////////////////////////////////////////////////////////
270 
271 // Records stacks for H1 history mechanism (DRD-style)
272 typedef
273    struct { ULong ull; ExeContext* ec; }
274    ULong_n_EC;
275 
276 
277 /* How many of the above records to collect for each thread?  Older
278    ones are dumped when we run out of space.  62.5k requires 1MB per
279    thread, since each ULong_n_EC record is 16 bytes long.  When more
280    than N_KWs_N_STACKs_PER_THREAD are present, the older half are
281    deleted to make space.  Hence in the worst case we will be able to
282    produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
283    Kw transitions (segments in this thread).  For the current setting
284    that gives a guaranteed stack for at least the last 31.25k
285    segments. */
286 #define N_KWs_N_STACKs_PER_THREAD 62500
287 
288 
289 #define N_FRAMES 8
290 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
291 #define RCEC_MAGIC 0xab88abb2UL
292 
293 /* RCEC usage is described in more detail in the section 'Change-event map2'
294    later in this file. */
295 typedef
296    struct _RCEC {
297       UWord magic;  /* sanity check only */
298       struct _RCEC* next;
299       UWord rc;
300       UWord rcX; /* used for crosschecking */
301       UWord frames_hash;          /* hash of all the frames */
302       UWord frames[N_FRAMES];
303    }
304    RCEC;
305 
306 struct _Thr {
307    /* Current VTSs for this thread.  They change as we go along.  viR
308       is the VTS to be used for reads, viW for writes.  Usually they
309       are the same, but can differ when we deal with reader-writer
310       locks.  It is always the case that
311          VtsID__cmpLEQ(viW,viR) == True
312       that is, viW must be the same, or lagging behind, viR. */
313    VtsID viR;
314    VtsID viW;
315 
316    /* Is initially False, and is set to True after the thread really
317       has done a low-level exit.  When True, we expect to never see
318       any more memory references done by this thread. */
319    Bool llexit_done;
320 
321    /* Is initially False, and is set to True after the thread has been
322       joined with (reaped by some other thread).  After this point, we
323       do not expect to see any uses of .viR or .viW, so it is safe to
324       set them to VtsID_INVALID. */
325    Bool joinedwith_done;
326 
327    /* A small integer giving a unique identity to this Thr.  See
328       comments on the definition of ScalarTS for details. */
329    ThrID thrid : SCALARTS_N_THRBITS;
330 
331    /* A filter that removes references for which we believe that
332       msmcread/msmcwrite will not change the state, nor report a
333       race. */
334    Filter* filter;
335 
336    /* A pointer back to the top level Thread structure.  There is a
337       1-1 mapping between Thread and Thr structures -- each Thr points
338       at its corresponding Thread, and vice versa.  Really, Thr and
339       Thread should be merged into a single structure. */
340    Thread* hgthread;
341 
342    /* cached_rcec maintains the last RCEC that was retrieved for this thread. */
343    RCEC cached_rcec; // cached_rcec value, not ref-counted.
344    /* The vex_shadow1 shadow of the SP register (SP_s1) is used to maintain
345       the validity of the cached rcec.
346       If SP_s1 is 0, then the cached rcec is invalid (cannot be used).
347       If SP_s1 is != 0, then the cached rcec is valid.  The valid cached rcec
348       can be used to generate a new RCEC by changing just the last frame. */
349 
350    /* The ULongs (scalar Kws) in this array accumulate in strictly
351       increasing order, without duplicates.  This is important because
352       we need to be able to find a given scalar Kw in this array
353       later, by binary search. */
354    XArray* /* ULong_n_EC */ local_Kws_n_stacks;
355 };
356 
357 
358 
359 /////////////////////////////////////////////////////////////////
360 /////////////////////////////////////////////////////////////////
361 //                                                             //
362 // data decls: SO                                              //
363 //                                                             //
364 /////////////////////////////////////////////////////////////////
365 /////////////////////////////////////////////////////////////////
366 
367 // (UInt) `echo "Synchronisation object" | md5sum`
368 #define SO_MAGIC 0x56b3c5b0U
369 
370 struct _SO {
371    struct _SO* admin_prev;
372    struct _SO* admin_next;
373    VtsID viR; /* r-clock of sender */
374    VtsID viW; /* w-clock of sender */
375    UInt  magic;
376 };
377 
378 
379 
380 /////////////////////////////////////////////////////////////////
381 /////////////////////////////////////////////////////////////////
382 //                                                             //
383 // Forward declarations                                        //
384 //                                                             //
385 /////////////////////////////////////////////////////////////////
386 /////////////////////////////////////////////////////////////////
387 
388 /* fwds for
389    Globals needed by other parts of the library.  These are set
390    once at startup and then never changed. */
391 static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
392 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
393 
394 /* misc fn and data fwdses */
395 static void VtsID__rcinc ( VtsID ii );
396 static void VtsID__rcdec ( VtsID ii );
397 
398 static inline Bool SVal__isC ( SVal s );
399 static inline VtsID SVal__unC_Rmin ( SVal s );
400 static inline VtsID SVal__unC_Wmin ( SVal s );
401 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
402 static inline void SVal__rcinc ( SVal s );
403 static inline void SVal__rcdec ( SVal s );
404 /* SVals in a LineZ are used to store various pointers. */
405 static inline void *SVal2Ptr (SVal s);
406 static inline SVal Ptr2SVal (void* ptr);
407 
408 /* A doubly-linked list of all the SOs. */
409 SO* admin_SO;
410 
411 
412 
413 /////////////////////////////////////////////////////////////////
414 /////////////////////////////////////////////////////////////////
415 //                                                             //
416 // SECTION BEGIN compressed shadow memory                      //
417 //                                                             //
418 /////////////////////////////////////////////////////////////////
419 /////////////////////////////////////////////////////////////////
420 
421 #ifndef __HB_ZSM_H
422 #define __HB_ZSM_H
423 
424 /* Initialise the library.  Once initialised, it will (or may) call
425    SVal__rcinc and SVal__rcdec in response to all the calls below, in order to
426    allow the user to do reference counting on the SVals stored herein.
427    It is important to understand, however, that due to internal
428    caching, the reference counts are in general inaccurate, and can be
429    both above or below the true reference count for an item.  In
430    particular, the library may indicate that the reference count for
431    an item is zero, when in fact it is not.
432 
433    To make the reference counting exact and therefore non-pointless,
434    call zsm_flush_cache.  Immediately after it returns, the reference
435    counts for all items, as deduced by the caller by observing calls
436    to SVal__rcinc and SVal__rcdec, will be correct, and so any items with a
437    zero reference count may be freed (or at least considered to be
438    unreferenced by this library).
439 */
440 static void zsm_init ( void );
441 
442 static void zsm_sset_range  ( Addr, SizeT, SVal );
443 static void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew );
444 static void zsm_scopy_range ( Addr, Addr, SizeT );
445 static void zsm_flush_cache ( void );
446 
447 #endif /* ! __HB_ZSM_H */
448 
449 
450 /* Round a up to the next multiple of N.  N must be a power of 2 */
451 #define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
452 /* Round a down to the next multiple of N.  N must be a power of 2 */
453 #define ROUNDDN(a, N)   ((a) & ~((N)-1))
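
/* Illustrative sketch only: quick sanity examples of the rounding macros
   above (not part of libhb). */
#if 0
static void example_rounding ( void )
{
   tl_assert(ROUNDUP(13, 8) == 16 && ROUNDUP(16, 8) == 16);
   tl_assert(ROUNDDN(13, 8) == 8  && ROUNDDN(16, 8) == 16);
}
#endif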
454 
455 /* True if a belongs in range [start, start + szB[
456    (i.e. start + szB is excluded). */
457 static inline Bool address_in_range (Addr a, Addr start,  SizeT szB)
458 {
459    /* Checking start <= a && a < start + szB.
460       As start and a are unsigned addresses, the condition can
461       be simplified. */
462    if (CHECK_ZSM)
463       tl_assert ((a - start < szB)
464                  == (start <= a
465                      &&       a < start + szB));
466    return a - start < szB;
467 }
468 
469 /* ------ CacheLine ------ */
470 
471 #define N_LINE_BITS      6 /* must be >= 3 */
472 #define N_LINE_ARANGE    (1 << N_LINE_BITS)
473 #define N_LINE_TREES     (N_LINE_ARANGE >> 3)
474 
475 typedef
476    struct {
477       UShort descrs[N_LINE_TREES];
478       SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
479    }
480    CacheLine;
481 
482 #define TREE_DESCR_16_0 (1<<0)
483 #define TREE_DESCR_32_0 (1<<1)
484 #define TREE_DESCR_16_1 (1<<2)
485 #define TREE_DESCR_64   (1<<3)
486 #define TREE_DESCR_16_2 (1<<4)
487 #define TREE_DESCR_32_1 (1<<5)
488 #define TREE_DESCR_16_3 (1<<6)
489 #define TREE_DESCR_8_0  (1<<7)
490 #define TREE_DESCR_8_1  (1<<8)
491 #define TREE_DESCR_8_2  (1<<9)
492 #define TREE_DESCR_8_3  (1<<10)
493 #define TREE_DESCR_8_4  (1<<11)
494 #define TREE_DESCR_8_5  (1<<12)
495 #define TREE_DESCR_8_6  (1<<13)
496 #define TREE_DESCR_8_7  (1<<14)
497 #define TREE_DESCR_DTY  (1<<15)
498 
499 typedef
500    struct {
501       SVal  dict[4]; /* can represent up to 4 diff values in the line */
502       UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
503                                       dict indexes */
504       /* if dict[0] == SVal_INVALID then dict[1] is a pointer to the
505          LineF to use, and dict[2..] are also SVal_INVALID. */
506    }
507    LineZ; /* compressed rep for a cache line */
508 
509 /* LineZ.dict[1] is used to store various pointers:
510    * In the first lineZ of a free SecMap, it points to the next free SecMap.
511    * In a lineZ for which we need to use a lineF, it points to the lineF. */
512 
513 
514 typedef
515    struct {
516       SVal w64s[N_LINE_ARANGE];
517    }
518    LineF; /* full rep for a cache line */
519 
520 /* We use a pool allocator for LineF, as LineF is relatively small,
521    and we will often alloc/release such lines. */
522 static PoolAlloc* LineF_pool_allocator;
523 
524 /* SVal in a lineZ are used to store various pointers.
525    Below are conversion functions to support that. */
526 static inline LineF *LineF_Ptr (LineZ *lineZ)
527 {
528    tl_assert(lineZ->dict[0] == SVal_INVALID);
529    return SVal2Ptr (lineZ->dict[1]);
530 }
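
/* Illustrative sketch only: how the SVal for byte 'i' of a line is recovered
   from the compressed LineZ representation described above.  The real
   decompression code (see read_twobit_array and the cacheline fetch logic
   later in this file) is authoritative; 'example_' names are not part of
   libhb. */
#if 0
static SVal example_LineZ_get_sval ( LineZ* lineZ, UWord i )
{
   if (lineZ->dict[0] != SVal_INVALID) {
      /* compressed form: each byte has a 2-bit index into dict[] */
      UWord ix = (lineZ->ix2s[i >> 2] >> (2 * (i & 3))) & 3;
      return lineZ->dict[ix];
   } else {
      /* full form: dict[1] points at a LineF holding one SVal per byte */
      return LineF_Ptr(lineZ)->w64s[i];
   }
}
#endif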
531 
532 /* Shadow memory.
533    Primary map is a WordFM Addr SecMap*.
534    SecMaps cover some page-size-ish section of address space and hold
535      a compressed representation.
536    CacheLine-sized chunks of SecMaps are copied into a Cache, being
537    decompressed when moved into the cache and recompressed on the
538    way out.  Because of this, the cache must operate as a writeback
539    cache, not a writethrough one.
540 
541    Each SecMap must hold a power-of-2 number of CacheLines.  Hence
542    N_SECMAP_BITS must be >= N_LINE_BITS.
543 */
544 #define N_SECMAP_BITS   13
545 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
546 
547 // # CacheLines held by a SecMap
548 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
549 
550 /* The data in the SecMap is held in the array of LineZs.  Each LineZ
551    either carries the required data directly, in a compressed
552    representation, or it holds (in .dict[1]) a pointer to a LineF
553    that holds the full representation.
554 
555    As each in-use LineF is referred to by exactly one LineZ,
556    the number of .linesZ[] that refer to a lineF should equal
557    the number of used lineF.
558 
559    RC obligations: the RCs presented to the user include exactly
560    the values in:
561    * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
562    * F reps that are in use
563 
564    Hence the following actions at the following transitions are required:
565 
566    F rep: alloc'd       -> freed                -- rcdec_LineF
567    F rep:               -> alloc'd              -- rcinc_LineF
568    Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
569    Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
570 */
571 
572 typedef
573    struct {
574       UInt   magic;
575       LineZ  linesZ[N_SECMAP_ZLINES];
576    }
577    SecMap;
578 
579 #define SecMap_MAGIC   0x571e58cbU
580 
581 // (UInt) `echo "Free SecMap" | md5sum`
582 #define SecMap_free_MAGIC 0x5a977f30U
583 
584 __attribute__((unused))
585 static inline Bool is_sane_SecMap ( SecMap* sm ) {
586    return sm != NULL && sm->magic == SecMap_MAGIC;
587 }
588 
589 /* ------ Cache ------ */
590 
591 #define N_WAY_BITS 16
592 #define N_WAY_NENT (1 << N_WAY_BITS)
593 
594 /* Each tag is the address of the associated CacheLine, rounded down
595    to a CacheLine address boundary.  A CacheLine size must be a power
596    of 2 and must be 8 or more.  Hence an easy way to initialise the
597    cache so it is empty is to set all the tag values to any value % 8
598    != 0, eg 1.  This means all queries in the cache initially miss.
599    It does however require us to detect, and not write back, any line
600    with a bogus tag. */
601 typedef
602    struct {
603       CacheLine lyns0[N_WAY_NENT];
604       Addr      tags0[N_WAY_NENT];
605    }
606    Cache;
607 
608 static inline Bool is_valid_scache_tag ( Addr tag ) {
609    /* a valid tag should be naturally aligned to the start of
610       a CacheLine. */
611    return 0 == (tag & (N_LINE_ARANGE - 1));
612 }
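
/* Illustrative sketch only: per the comment above, the cache can be put into
   an "all misses" state by giving every entry a tag that is not
   cacheline-aligned, e.g. 1.  The real initialisation lives elsewhere in
   this file; the helper below is just an illustration. */
#if 0
static void example_empty_the_cache ( Cache* c )
{
   UWord i;
   for (i = 0; i < N_WAY_NENT; i++) {
      c->tags0[i] = 1;   /* not a multiple of N_LINE_ARANGE, so never matches */
      tl_assert(!is_valid_scache_tag(c->tags0[i]));
   }
}
#endif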
613 
614 
615 /* --------- Primary data structures --------- */
616 
617 /* Shadow memory primary map */
618 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
619 static Cache   cache_shmem;
620 
621 
622 static UWord stats__secmaps_search       = 0; // # SM finds
623 static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
624 static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
625 static UWord stats__secmaps_in_map_shmem = 0; // # SecMaps 'live'
626 static UWord stats__secmaps_scanGC       = 0; // # nr of scan GC done.
627 static UWord stats__secmaps_scanGCed     = 0; // # SecMaps GC-ed via scan
628 static UWord stats__secmaps_ssetGCed     = 0; // # SecMaps GC-ed via setnoaccess
629 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
630 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
631 static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
632 static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
633 static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
634 static UWord stats__cache_F_fetches      = 0; // # F lines fetched
635 static UWord stats__cache_F_wbacks       = 0; // # F lines written back
636 static UWord stats__cache_flushes_invals = 0; // # cache flushes and invals
637 static UWord stats__cache_totrefs        = 0; // # total accesses
638 static UWord stats__cache_totmisses      = 0; // # misses
639 static ULong stats__cache_make_New_arange = 0; // total arange made New
640 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
641 static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
642 static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
643 static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
644 static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
645 static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
646 static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
647 static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
648 static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
649 static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
650 static UWord stats__cline_sread08s       = 0; // # calls to s_m_get8
651 static UWord stats__cline_swrite08s      = 0; // # calls to s_m_set8
652 static UWord stats__cline_swrite16s      = 0; // # calls to s_m_set16
653 static UWord stats__cline_swrite32s      = 0; // # calls to s_m_set32
654 static UWord stats__cline_swrite64s      = 0; // # calls to s_m_set64
655 static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
656 static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
657 static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
658 static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
659 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
660 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
661 static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
662 static UWord stats__vts__tick            = 0; // # calls to VTS__tick
663 static UWord stats__vts__join            = 0; // # calls to VTS__join
664 static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
665 static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
666 static UWord stats__vts_tab_GC           = 0; // # nr of vts_tab GC
667 static UWord stats__vts_pruning          = 0; // # nr of vts pruning
668 
669 // # calls to VTS__cmp_structural w/ slow case
670 static UWord stats__vts__cmp_structural_slow = 0;
671 
672 // # calls to VTS__indexAt_SLOW
673 static UWord stats__vts__indexat_slow = 0;
674 
675 // # calls to vts_set__find__or__clone_and_add
676 static UWord stats__vts_set__focaa    = 0;
677 
678 // # calls to vts_set__find__or__clone_and_add that lead to an
679 // allocation
680 static UWord stats__vts_set__focaa_a  = 0;
681 
682 
683 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
684    return a & ~(N_SECMAP_ARANGE - 1);
685 }
686 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
687    return a & (N_SECMAP_ARANGE - 1);
688 }
689 
690 
691 /*----------------------------------------------------------------*/
692 /*--- map_shmem :: WordFM Addr SecMap                          ---*/
693 /*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
694 /*----------------------------------------------------------------*/
695 
696 /*--------------- SecMap allocation --------------- */
697 
698 static HChar* shmem__bigchunk_next = NULL;
699 static HChar* shmem__bigchunk_end1 = NULL;
700 
701 static void* shmem__bigchunk_alloc ( SizeT n )
702 {
703    const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
704    tl_assert(n > 0);
705    n = VG_ROUNDUP(n, 16);
706    tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
707    tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
708              <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
709    if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
710       if (0)
711       VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
712                   (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
713       shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
714       if (shmem__bigchunk_next == NULL)
715          VG_(out_of_memory_NORETURN)(
716             "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
717       shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
718    }
719    tl_assert(shmem__bigchunk_next);
720    tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
721    tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
722    shmem__bigchunk_next += n;
723    return shmem__bigchunk_next - n;
724 }
725 
726 /* SecMaps changed to be fully SVal_NOACCESS are inserted in a list of
727    recycled SecMaps.  When a new SecMap is needed, a recycled SecMap
728    will be used in preference to allocating a new one. */
729 /* We make a linked list of SecMaps.  The first LineZ of a free SecMap is
730    re-used to implement the linked list. */
731 /* Returns the SecMap following sm in the free list.
732    NULL if sm is the last SecMap. sm must be on the free list. */
733 static inline SecMap *SecMap_freelist_next ( SecMap* sm )
734 {
735    tl_assert (sm);
736    tl_assert (sm->magic == SecMap_free_MAGIC);
737    return SVal2Ptr (sm->linesZ[0].dict[1]);
738 }
739 static inline void set_SecMap_freelist_next ( SecMap* sm, SecMap* next )
740 {
741    tl_assert (sm);
742    tl_assert (sm->magic == SecMap_free_MAGIC);
743    tl_assert (next == NULL || next->magic == SecMap_free_MAGIC);
744    sm->linesZ[0].dict[1] = Ptr2SVal (next);
745 }
746 
747 static SecMap *SecMap_freelist = NULL;
748 static UWord SecMap_freelist_length(void)
749 {
750    SecMap *sm;
751    UWord n = 0;
752 
753    sm = SecMap_freelist;
754    while (sm) {
755      n++;
756      sm = SecMap_freelist_next (sm);
757    }
758    return n;
759 }
760 
761 static void push_SecMap_on_freelist(SecMap* sm)
762 {
763    if (0) VG_(message)(Vg_DebugMsg, "%p push\n", sm);
764    sm->magic = SecMap_free_MAGIC;
765    set_SecMap_freelist_next(sm, SecMap_freelist);
766    SecMap_freelist = sm;
767 }
768 /* Returns a free SecMap if there is one.
769    Otherwise, returns NULL. */
770 static SecMap *pop_SecMap_from_freelist(void)
771 {
772    SecMap *sm;
773 
774    sm = SecMap_freelist;
775    if (sm) {
776       tl_assert (sm->magic == SecMap_free_MAGIC);
777       SecMap_freelist = SecMap_freelist_next (sm);
778       if (0) VG_(message)(Vg_DebugMsg, "%p pop\n", sm);
779    }
780    return sm;
781 }
782 
783 static SecMap* shmem__alloc_or_recycle_SecMap ( void )
784 {
785    Word    i, j;
786    SecMap* sm = pop_SecMap_from_freelist();
787 
788    if (!sm) {
789       sm = shmem__bigchunk_alloc( sizeof(SecMap) );
790       stats__secmaps_allocd++;
791       stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
792       stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
793       stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
794    }
795    if (0) VG_(printf)("alloc_SecMap %p\n",sm);
796    tl_assert(sm);
797    sm->magic = SecMap_MAGIC;
798    for (i = 0; i < N_SECMAP_ZLINES; i++) {
799       sm->linesZ[i].dict[0] = SVal_NOACCESS;
800       sm->linesZ[i].dict[1] = SVal_INVALID;
801       sm->linesZ[i].dict[2] = SVal_INVALID;
802       sm->linesZ[i].dict[3] = SVal_INVALID;
803       for (j = 0; j < N_LINE_ARANGE/4; j++)
804          sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
805    }
806    return sm;
807 }
808 
809 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
810 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
811 
812 static SecMap* shmem__find_SecMap ( Addr ga )
813 {
814    SecMap* sm    = NULL;
815    Addr    gaKey = shmem__round_to_SecMap_base(ga);
816    // Cache
817    stats__secmaps_search++;
818    if (LIKELY(gaKey == smCache[0].gaKey))
819       return smCache[0].sm;
820    if (LIKELY(gaKey == smCache[1].gaKey)) {
821       SMCacheEnt tmp = smCache[0];
822       smCache[0] = smCache[1];
823       smCache[1] = tmp;
824       return smCache[0].sm;
825    }
826    if (gaKey == smCache[2].gaKey) {
827       SMCacheEnt tmp = smCache[1];
828       smCache[1] = smCache[2];
829       smCache[2] = tmp;
830       return smCache[1].sm;
831    }
832    // end Cache
833    stats__secmaps_search_slow++;
834    if (VG_(lookupFM)( map_shmem,
835                       NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
836       tl_assert(sm != NULL);
837       smCache[2] = smCache[1];
838       smCache[1] = smCache[0];
839       smCache[0].gaKey = gaKey;
840       smCache[0].sm    = sm;
841    } else {
842       tl_assert(sm == NULL);
843    }
844    return sm;
845 }
846 
847 /* Scan the SecMaps and count those that can be GC-ed.
848    If 'really' is True, actually GC them. */
849 /* NOT TO BE CALLED FROM WITHIN libzsm. */
850 static UWord next_SecMap_GC_at = 1000;
851 __attribute__((noinline))
852 static UWord shmem__SecMap_do_GC(Bool really)
853 {
854    UWord secmapW = 0;
855    Addr  gaKey;
856    UWord examined = 0;
857    UWord ok_GCed = 0;
858 
859    /* First invalidate the smCache */
860    smCache[0].gaKey = 1;
861    smCache[1].gaKey = 1;
862    smCache[2].gaKey = 1;
863    STATIC_ASSERT (3 == sizeof(smCache)/sizeof(smCache[0]));
864 
865    VG_(initIterFM)( map_shmem );
866    while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
867       UWord   i;
868       UWord   j;
869       UWord   n_linesF = 0;
870       SecMap* sm = (SecMap*)secmapW;
871       tl_assert(sm->magic == SecMap_MAGIC);
872       Bool ok_to_GC = True;
873 
874       examined++;
875 
876       /* Deal with the LineZs and the possible LineF of a LineZ. */
877       for (i = 0; i < N_SECMAP_ZLINES && ok_to_GC; i++) {
878          LineZ* lineZ = &sm->linesZ[i];
879          if (lineZ->dict[0] != SVal_INVALID) {
880             ok_to_GC = lineZ->dict[0] == SVal_NOACCESS
881                && !SVal__isC (lineZ->dict[1])
882                && !SVal__isC (lineZ->dict[2])
883                && !SVal__isC (lineZ->dict[3]);
884          } else {
885             LineF *lineF = LineF_Ptr(lineZ);
886             n_linesF++;
887             for (j = 0; j < N_LINE_ARANGE && ok_to_GC; j++)
888                ok_to_GC = lineF->w64s[j] == SVal_NOACCESS;
889          }
890       }
891       if (ok_to_GC)
892          ok_GCed++;
893       if (ok_to_GC && really) {
894         SecMap *fm_sm;
895         Addr fm_gaKey;
896         /* We cannot remove a SecMap from map_shmem while iterating.
897            So, stop iteration, remove from map_shmem, recreate the iteration
898            on the next SecMap. */
899         VG_(doneIterFM) ( map_shmem );
900         /* No need to rcdec linesZ or linesF, these are all SVal_NOACCESS.
901            We just need to free the lineF referenced by the linesZ. */
902         if (n_linesF > 0) {
903            for (i = 0; i < N_SECMAP_ZLINES && n_linesF > 0; i++) {
904               LineZ* lineZ = &sm->linesZ[i];
905               if (lineZ->dict[0] == SVal_INVALID) {
906                  VG_(freeEltPA)( LineF_pool_allocator, LineF_Ptr(lineZ) );
907                  n_linesF--;
908               }
909            }
910         }
911         if (!VG_(delFromFM)(map_shmem, &fm_gaKey, (UWord*)&fm_sm, gaKey))
912           tl_assert (0);
913         stats__secmaps_in_map_shmem--;
914         tl_assert (gaKey == fm_gaKey);
915         tl_assert (sm == fm_sm);
916         stats__secmaps_scanGCed++;
917         push_SecMap_on_freelist (sm);
918         VG_(initIterAtFM) (map_shmem, gaKey + N_SECMAP_ARANGE);
919       }
920    }
921    VG_(doneIterFM)( map_shmem );
922 
923    if (really) {
924       stats__secmaps_scanGC++;
925       /* Next GC when we approach the max allocated */
926       next_SecMap_GC_at = stats__secmaps_allocd - 1000;
927       /* Unless we GCed less than 10%.  In that case we allow 10% more
928          allocation before GCing again.  This avoids doing a lot of costly
929          GCs in the worst case: the 'growing phase' of an application
930          that allocates a lot of memory.
931          The worst case can be reproduced e.g. by
932              perf/memrw -t 30000000 -b 1000 -r 1 -l 1
933          which allocates around 30Gb of memory. */
934       if (ok_GCed < stats__secmaps_allocd/10)
935          next_SecMap_GC_at = stats__secmaps_allocd + stats__secmaps_allocd/10;
936 
937    }
938 
939    if (VG_(clo_stats) && really) {
940       VG_(message)(Vg_DebugMsg,
941                   "libhb: SecMap GC: #%lu scanned %lu, GCed %lu,"
942                    " next GC at %lu\n",
943                    stats__secmaps_scanGC, examined, ok_GCed,
944                    next_SecMap_GC_at);
945    }
946 
947    return ok_GCed;
948 }
949 
950 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
951 {
952    SecMap* sm = shmem__find_SecMap ( ga );
953    if (LIKELY(sm)) {
954       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
955       return sm;
956    } else {
957       /* create a new one */
958       Addr gaKey = shmem__round_to_SecMap_base(ga);
959       sm = shmem__alloc_or_recycle_SecMap();
960       tl_assert(sm);
961       VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
962       stats__secmaps_in_map_shmem++;
963       if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
964       return sm;
965    }
966 }
967 
968 /* Returns the number of lineFs which are in use.  Note: this scans
969    the secmap wordFM, so it is to be used for statistics only. */
970 __attribute__((noinline))
971 static UWord shmem__SecMap_used_linesF(void)
972 {
973    UWord secmapW = 0;
974    Addr  gaKey;
975    UWord inUse = 0;
976 
977    VG_(initIterFM)( map_shmem );
978    while (VG_(nextIterFM)( map_shmem, &gaKey, &secmapW )) {
979       UWord   i;
980       SecMap* sm = (SecMap*)secmapW;
981       tl_assert(sm->magic == SecMap_MAGIC);
982 
983       for (i = 0; i < N_SECMAP_ZLINES; i++) {
984          LineZ* lineZ = &sm->linesZ[i];
985          if (lineZ->dict[0] == SVal_INVALID)
986             inUse++;
987       }
988    }
989    VG_(doneIterFM)( map_shmem );
990 
991    return inUse;
992 }
993 
994 /* ------------ LineF and LineZ related ------------ */
995 
996 static void rcinc_LineF ( LineF* lineF ) {
997    UWord i;
998    for (i = 0; i < N_LINE_ARANGE; i++)
999       SVal__rcinc(lineF->w64s[i]);
1000 }
1001 
1002 static void rcdec_LineF ( LineF* lineF ) {
1003    UWord i;
1004    for (i = 0; i < N_LINE_ARANGE; i++)
1005       SVal__rcdec(lineF->w64s[i]);
1006 }
1007 
1008 static void rcinc_LineZ ( LineZ* lineZ ) {
1009    tl_assert(lineZ->dict[0] != SVal_INVALID);
1010    SVal__rcinc(lineZ->dict[0]);
1011    if (lineZ->dict[1] != SVal_INVALID) SVal__rcinc(lineZ->dict[1]);
1012    if (lineZ->dict[2] != SVal_INVALID) SVal__rcinc(lineZ->dict[2]);
1013    if (lineZ->dict[3] != SVal_INVALID) SVal__rcinc(lineZ->dict[3]);
1014 }
1015 
1016 static void rcdec_LineZ ( LineZ* lineZ ) {
1017    tl_assert(lineZ->dict[0] != SVal_INVALID);
1018    SVal__rcdec(lineZ->dict[0]);
1019    if (lineZ->dict[1] != SVal_INVALID) SVal__rcdec(lineZ->dict[1]);
1020    if (lineZ->dict[2] != SVal_INVALID) SVal__rcdec(lineZ->dict[2]);
1021    if (lineZ->dict[3] != SVal_INVALID) SVal__rcdec(lineZ->dict[3]);
1022 }
1023 
1024 inline
1025 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
1026    Word bix, shft, mask, prep;
1027    tl_assert(ix >= 0);
1028    bix  = ix >> 2;
1029    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1030    mask = 3 << shft;
1031    prep = b2 << shft;
1032    arr[bix] = (arr[bix] & ~mask) | prep;
1033 }
1034 
1035 inline
1036 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
1037    Word bix, shft;
1038    tl_assert(ix >= 0);
1039    bix  = ix >> 2;
1040    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
1041    return (arr[bix] >> shft) & 3;
1042 }
1043 
1044 /* We cache one free lineF, to avoid pool allocator calls.
1045    Measurement on firefox has shown that this avoids more than 90%
1046    of the PA calls. */
1047 static LineF *free_lineF = NULL;
1048 
1049 /* Allocates a lineF for LineZ. Sets lineZ in a state indicating
1050    lineF has to be used. */
1051 static inline LineF *alloc_LineF_for_Z (LineZ *lineZ)
1052 {
1053    LineF *lineF;
1054 
1055    tl_assert(lineZ->dict[0] == SVal_INVALID);
1056 
1057    if (LIKELY(free_lineF)) {
1058       lineF = free_lineF;
1059       free_lineF = NULL;
1060    } else {
1061       lineF = VG_(allocEltPA) ( LineF_pool_allocator );
1062    }
1063    lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
1064    lineZ->dict[1] = Ptr2SVal (lineF);
1065 
1066    return lineF;
1067 }
1068 
1069 /* rcdec the LineF of lineZ, frees the lineF, and sets lineZ
1070    back to its initial state SVal_NOACCESS (i.e. ready to be
1071    read or written just after SecMap allocation). */
1072 static inline void clear_LineF_of_Z (LineZ *lineZ)
1073 {
1074    LineF *lineF = LineF_Ptr(lineZ);
1075 
1076    rcdec_LineF(lineF);
1077    if (UNLIKELY(free_lineF)) {
1078       VG_(freeEltPA)( LineF_pool_allocator, lineF );
1079    } else {
1080       free_lineF = lineF;
1081    }
1082    lineZ->dict[0] = SVal_NOACCESS;
1083    lineZ->dict[1] = SVal_INVALID;
1084 }
1085 
1086 /* Given address 'tag', find either the Z or F line containing relevant
1087    data, so it can be read into the cache.
1088 */
1089 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
1090                                   /*OUT*/LineF** fp, Addr tag ) {
1091    LineZ* lineZ;
1092    LineF* lineF;
1093    UWord   zix;
1094    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1095    UWord   smoff = shmem__get_SecMap_offset(tag);
1096    /* since smoff is derived from a valid tag, it should be
1097       cacheline-aligned. */
1098    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1099    zix = smoff >> N_LINE_BITS;
1100    tl_assert(zix < N_SECMAP_ZLINES);
1101    lineZ = &sm->linesZ[zix];
1102    lineF = NULL;
1103    if (lineZ->dict[0] == SVal_INVALID) {
1104       lineF = LineF_Ptr (lineZ);
1105       lineZ = NULL;
1106    }
1107    *zp = lineZ;
1108    *fp = lineF;
1109 }
1110 
1111 /* Given address 'tag', return the relevant SecMap and the index of
1112    the LineZ within it, in the expectation that the line is to be
1113    overwritten.  Regardless of whether 'tag' is currently associated
1114    with a Z or F representation, do an rcdec on the current
1115    representation, in recognition of the fact that the contents are
1116    just about to be overwritten. */
1117 static __attribute__((noinline))
1118 void find_Z_for_writing ( /*OUT*/SecMap** smp,
1119                           /*OUT*/Word* zixp,
1120                           Addr tag ) {
1121    LineZ* lineZ;
1122    UWord   zix;
1123    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
1124    UWord   smoff = shmem__get_SecMap_offset(tag);
1125    /* since smoff is derived from a valid tag, it should be
1126       cacheline-aligned. */
1127    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
1128    zix = smoff >> N_LINE_BITS;
1129    tl_assert(zix < N_SECMAP_ZLINES);
1130    lineZ = &sm->linesZ[zix];
1131    /* re RCs, we are about to overwrite this LineZ so that new data can be
1132       parked in it.  Hence we have to rcdec its current contents first. */
1133    /* If lineZ has an associated lineF, free it up. */
1134    if (lineZ->dict[0] == SVal_INVALID)
1135       clear_LineF_of_Z(lineZ);
1136    else
1137       rcdec_LineZ(lineZ);
1138    *smp  = sm;
1139    *zixp = zix;
1140 }
1141 
1142 /* ------------ CacheLine and implicit-tree related ------------ */
1143 
1144 __attribute__((unused))
1145 static void pp_CacheLine ( CacheLine* cl ) {
1146    Word i;
1147    if (!cl) {
1148       VG_(printf)("%s","pp_CacheLine(NULL)\n");
1149       return;
1150    }
1151    for (i = 0; i < N_LINE_TREES; i++)
1152       VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
1153    for (i = 0; i < N_LINE_ARANGE; i++)
1154       VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
1155 }
1156 
1157 static UChar descr_to_validbits ( UShort descr )
1158 {
1159    /* a.k.a Party Time for gcc's constant folder */
1160 #  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
1161                 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
1162              ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
1163                           ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
1164                           ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
1165                           ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
1166                           ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
1167                           ( (b16_2) << 4)  | ( (b64)   << 3)  | \
1168                           ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
1169                           ( (b16_0) << 0) ) )
1170 
1171 #  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
1172              ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
1173                          ( (bit5) << 5) | ( (bit4) << 4) | \
1174                          ( (bit3) << 3) | ( (bit2) << 2) | \
1175                          ( (bit1) << 1) | ( (bit0) << 0) ) )
1176 
1177    /* these should all get folded out at compile time */
1178    tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
1179    tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
1180    tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
1181    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
1182    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
1183    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
1184    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
1185    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
1186    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
1187 
1188    switch (descr) {
1189    /*
1190               +--------------------------------- TREE_DESCR_8_7
1191               |             +------------------- TREE_DESCR_8_0
1192               |             |  +---------------- TREE_DESCR_16_3
1193               |             |  | +-------------- TREE_DESCR_32_1
1194               |             |  | | +------------ TREE_DESCR_16_2
1195               |             |  | | |  +--------- TREE_DESCR_64
1196               |             |  | | |  |  +------ TREE_DESCR_16_1
1197               |             |  | | |  |  | +---- TREE_DESCR_32_0
1198               |             |  | | |  |  | | +-- TREE_DESCR_16_0
1199               |             |  | | |  |  | | |
1200               |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
1201    case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
1202                                                  return BYTE(1,1,1,1,1,1,1,1);
1203    case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
1204                                                  return BYTE(1,1,0,1,1,1,1,1);
1205    case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
1206                                                  return BYTE(0,1,1,1,1,1,1,1);
1207    case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
1208                                                  return BYTE(0,1,0,1,1,1,1,1);
1209 
1210    case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
1211                                                  return BYTE(1,1,1,1,1,1,0,1);
1212    case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
1213                                                  return BYTE(1,1,0,1,1,1,0,1);
1214    case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
1215                                                  return BYTE(0,1,1,1,1,1,0,1);
1216    case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
1217                                                  return BYTE(0,1,0,1,1,1,0,1);
1218 
1219    case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
1220                                                  return BYTE(1,1,1,1,0,1,1,1);
1221    case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
1222                                                  return BYTE(1,1,0,1,0,1,1,1);
1223    case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
1224                                                  return BYTE(0,1,1,1,0,1,1,1);
1225    case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
1226                                                  return BYTE(0,1,0,1,0,1,1,1);
1227 
1228    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
1229                                                  return BYTE(1,1,1,1,0,1,0,1);
1230    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
1231                                                  return BYTE(1,1,0,1,0,1,0,1);
1232    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
1233                                                  return BYTE(0,1,1,1,0,1,0,1);
1234    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
1235                                                  return BYTE(0,1,0,1,0,1,0,1);
1236 
1237    case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
1238                                                  return BYTE(0,0,0,1,1,1,1,1);
1239    case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
1240                                                  return BYTE(0,0,0,1,1,1,0,1);
1241    case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
1242                                                  return BYTE(0,0,0,1,0,1,1,1);
1243    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
1244                                                  return BYTE(0,0,0,1,0,1,0,1);
1245 
1246    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
1247                                                  return BYTE(1,1,1,1,0,0,0,1);
1248    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
1249                                                  return BYTE(1,1,0,1,0,0,0,1);
1250    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
1251                                                  return BYTE(0,1,1,1,0,0,0,1);
1252    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
1253                                                  return BYTE(0,1,0,1,0,0,0,1);
1254 
1255    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
1256                                                  return BYTE(0,0,0,1,0,0,0,1);
1257 
1258    case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
1259                                                  return BYTE(0,0,0,0,0,0,0,1);
1260 
1261    default: return BYTE(0,0,0,0,0,0,0,0);
1262                    /* INVALID - any valid descr produces at least one
1263                       valid bit in tree[0..7]*/
1264    }
1265    /* NOTREACHED*/
1266    tl_assert(0);
1267 
1268 #  undef DESCR
1269 #  undef BYTE
1270 }
1271 
1272 __attribute__((unused))
1273 static Bool is_sane_Descr ( UShort descr ) {
1274    return descr_to_validbits(descr) != 0;
1275 }
1276 
1277 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
1278    VG_(sprintf)(dst,
1279                 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
1280                 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
1281                 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
1282                 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
1283                 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
1284                 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
1285                 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
1286                 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
1287                 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
1288                 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
1289                 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
1290                 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
1291                 (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
1292                 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
1293                 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
1294                 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
1295    );
1296 }
1297 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
1298    VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
1299                      (Int)((byte & 128) ? 1 : 0),
1300                      (Int)((byte &  64) ? 1 : 0),
1301                      (Int)((byte &  32) ? 1 : 0),
1302                      (Int)((byte &  16) ? 1 : 0),
1303                      (Int)((byte &   8) ? 1 : 0),
1304                      (Int)((byte &   4) ? 1 : 0),
1305                      (Int)((byte &   2) ? 1 : 0),
1306                      (Int)((byte &   1) ? 1 : 0)
1307    );
1308 }
1309 
1310 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
1311    Word  i;
1312    UChar validbits = descr_to_validbits(descr);
1313    HChar buf[128], buf2[128];    // large enough
1314    if (validbits == 0)
1315       goto bad;
1316    for (i = 0; i < 8; i++) {
1317       if (validbits & (1<<i)) {
1318          if (tree[i] == SVal_INVALID)
1319             goto bad;
1320       } else {
1321          if (tree[i] != SVal_INVALID)
1322             goto bad;
1323       }
1324    }
1325    return True;
1326   bad:
1327    sprintf_Descr( buf, descr );
1328    sprintf_Byte( buf2, validbits );
1329    VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
1330    VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
1331    VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
1332    for (i = 0; i < 8; i++)
1333       VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
1334    VG_(printf)("%s","}\n");
1335    return False;
1336 }
1337 
1338 static Bool is_sane_CacheLine ( CacheLine* cl )
1339 {
1340    Word tno, cloff;
1341 
1342    if (!cl) goto bad;
1343 
1344    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1345       UShort descr = cl->descrs[tno];
1346       SVal*  tree  = &cl->svals[cloff];
1347       if (!is_sane_Descr_and_Tree(descr, tree))
1348          goto bad;
1349    }
1350    tl_assert(cloff == N_LINE_ARANGE);
1351    return True;
1352   bad:
1353    pp_CacheLine(cl);
1354    return False;
1355 }
1356 
1357 static UShort normalise_tree ( /*MOD*/SVal* tree )
1358 {
1359    UShort descr;
1360    /* pre: incoming tree[0..7] does not have any invalid shvals, in
1361       particular no zeroes. */
1362    if (CHECK_ZSM
1363        && UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
1364                    || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
1365                    || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
1366                    || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
1367       tl_assert(0);
1368 
1369    descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
1370            | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
1371            | TREE_DESCR_8_1 | TREE_DESCR_8_0;
1372    /* build 16-bit layer */
1373    if (tree[1] == tree[0]) {
1374       tree[1] = SVal_INVALID;
1375       descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
1376       descr |= TREE_DESCR_16_0;
1377    }
1378    if (tree[3] == tree[2]) {
1379       tree[3] = SVal_INVALID;
1380       descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
1381       descr |= TREE_DESCR_16_1;
1382    }
1383    if (tree[5] == tree[4]) {
1384       tree[5] = SVal_INVALID;
1385       descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
1386       descr |= TREE_DESCR_16_2;
1387    }
1388    if (tree[7] == tree[6]) {
1389       tree[7] = SVal_INVALID;
1390       descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
1391       descr |= TREE_DESCR_16_3;
1392    }
1393    /* build 32-bit layer */
1394    if (tree[2] == tree[0]
1395        && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
1396       tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
1397       descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
1398       descr |= TREE_DESCR_32_0;
1399    }
1400    if (tree[6] == tree[4]
1401        && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
1402       tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
1403       descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
1404       descr |= TREE_DESCR_32_1;
1405    }
1406    /* build 64-bit layer */
1407    if (tree[4] == tree[0]
1408        && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
1409       tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
1410       descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
1411       descr |= TREE_DESCR_64;
1412    }
1413    return descr;
1414 }
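
/* Worked example (illustrative only): if on entry tree[0..7] all hold
   the same shadow value V, the three merging layers above collapse the
   whole tree, leaving tree[0] == V, tree[1..7] == SVal_INVALID and a
   returned descriptor of exactly TREE_DESCR_64.  If instead only
   tree[0] == tree[1] (everything else pairwise distinct), only the
   first pair merges: the result is TREE_DESCR_16_0 | TREE_DESCR_8_7
   | TREE_DESCR_8_6 | TREE_DESCR_8_5 | TREE_DESCR_8_4 | TREE_DESCR_8_3
   | TREE_DESCR_8_2, with tree[1] set to SVal_INVALID. */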
1415 
1416 /* This takes a cacheline where all the data is at the leaves
1417    (w8[..]) and builds a correctly normalised tree. */
1418 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
1419 {
1420    Word tno, cloff;
1421    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1422       SVal* tree = &cl->svals[cloff];
1423       cl->descrs[tno] = normalise_tree( tree );
1424    }
1425    tl_assert(cloff == N_LINE_ARANGE);
1426    if (CHECK_ZSM)
1427       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1428    stats__cline_normalises++;
1429 }
1430 
1431 
1432 typedef struct { UChar count; SVal sval; } CountedSVal;
1433 
1434 static
1435 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
1436                                /*OUT*/Word* dstUsedP,
1437                                Word nDst, CacheLine* src )
1438 {
1439    Word  tno, cloff, dstUsed;
1440 
1441    tl_assert(nDst == N_LINE_ARANGE);
1442    dstUsed = 0;
1443 
1444    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
1445       UShort descr = src->descrs[tno];
1446       SVal*  tree  = &src->svals[cloff];
1447 
1448       /* sequentialise the tree described by (descr,tree). */
1449 #     define PUT(_n,_v)                                \
1450          do { dst[dstUsed  ].count = (_n);             \
1451               dst[dstUsed++].sval  = (_v);             \
1452          } while (0)
1453 
1454       /* byte 0 */
1455       if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
1456       if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
1457       if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
1458       if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
1459       /* byte 1 */
1460       if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
1461       /* byte 2 */
1462       if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
1463       if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
1464       /* byte 3 */
1465       if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
1466       /* byte 4 */
1467       if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
1468       if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
1469       if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
1470       /* byte 5 */
1471       if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
1472       /* byte 6 */
1473       if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
1474       if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
1475       /* byte 7 */
1476       if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
1477 
1478 #     undef PUT
1479       /* END sequentialise the tree described by (descr,tree). */
1480 
1481    }
1482    tl_assert(cloff == N_LINE_ARANGE);
1483    tl_assert(dstUsed <= nDst);
1484 
1485    *dstUsedP = dstUsed;
1486 }
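
/* For example (illustrative only): a fully collapsed tree whose
   descriptor is just TREE_DESCR_64 contributes a single pair
   {count=8, sval=tree[0]}, whereas a fully expanded tree (all eight
   TREE_DESCR_8_* bits set) contributes eight {count=1, sval=tree[i]}
   pairs.  Either way, the counts emitted for a whole line sum to
   N_LINE_ARANGE. */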
1487 
1488 /* Write the cacheline 'wix' to backing store.  Where it ends up
1489    is determined by its tag field. */
1490 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
1491 {
1492    Word        i, j, k, m;
1493    Addr        tag;
1494    SecMap*     sm;
1495    CacheLine*  cl;
1496    LineZ* lineZ;
1497    LineF* lineF;
1498    Word        zix, fix, csvalsUsed;
1499    CountedSVal csvals[N_LINE_ARANGE];
1500    SVal        sv;
1501 
1502    if (0)
1503    VG_(printf)("scache wback line %d\n", (Int)wix);
1504 
1505    tl_assert(wix >= 0 && wix < N_WAY_NENT);
1506 
1507    tag =  cache_shmem.tags0[wix];
1508    cl  = &cache_shmem.lyns0[wix];
1509 
1510    /* The cache line may have been invalidated; if so, ignore it. */
1511    if (!is_valid_scache_tag(tag))
1512       return;
1513 
1514    /* Where are we going to put it? */
1515    sm         = NULL;
1516    lineZ      = NULL;
1517    lineF      = NULL;
1518    zix = fix = -1;
1519 
1520    /* find the Z line to write in and rcdec it or the associated F
1521       line. */
1522    find_Z_for_writing( &sm, &zix, tag );
1523 
1524    tl_assert(sm);
1525    tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
1526    lineZ = &sm->linesZ[zix];
1527 
1528    /* Generate the data to be stored */
1529    if (CHECK_ZSM)
1530       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1531 
1532    csvalsUsed = -1;
1533    sequentialise_CacheLine( csvals, &csvalsUsed,
1534                             N_LINE_ARANGE, cl );
1535    tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
1536    if (0) VG_(printf)("%ld ", csvalsUsed);
1537 
1538    lineZ->dict[0] = lineZ->dict[1]
1539                   = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
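
   /* What follows attempts the compressed (Z) encoding: each of the
      N_LINE_ARANGE shadow values is replaced by a 2-bit index into the
      4-entry dictionary dict[], built on the fly.  If the line turns
      out to contain more than 4 distinct shadow values, the loop below
      bails out and the full (F) representation is used instead. */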
1540 
1541    /* i indexes actual shadow values, k is cursor in csvals */
1542    i = 0;
1543    for (k = 0; k < csvalsUsed; k++) {
1544 
1545       sv = csvals[k].sval;
1546       if (CHECK_ZSM)
1547          tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1548       /* do we already have it? */
1549       if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
1550       if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
1551       if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
1552       if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
1553       /* no.  look for a free slot. */
1554       if (CHECK_ZSM)
1555          tl_assert(sv != SVal_INVALID);
1556       if (lineZ->dict[0]
1557           == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
1558       if (lineZ->dict[1]
1559           == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
1560       if (lineZ->dict[2]
1561           == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
1562       if (lineZ->dict[3]
1563           == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
1564       break; /* we'll have to use the f rep */
1565      dict_ok:
1566       m = csvals[k].count;
1567       if (m == 8) {
1568          write_twobit_array( lineZ->ix2s, i+0, j );
1569          write_twobit_array( lineZ->ix2s, i+1, j );
1570          write_twobit_array( lineZ->ix2s, i+2, j );
1571          write_twobit_array( lineZ->ix2s, i+3, j );
1572          write_twobit_array( lineZ->ix2s, i+4, j );
1573          write_twobit_array( lineZ->ix2s, i+5, j );
1574          write_twobit_array( lineZ->ix2s, i+6, j );
1575          write_twobit_array( lineZ->ix2s, i+7, j );
1576          i += 8;
1577       }
1578       else if (m == 4) {
1579          write_twobit_array( lineZ->ix2s, i+0, j );
1580          write_twobit_array( lineZ->ix2s, i+1, j );
1581          write_twobit_array( lineZ->ix2s, i+2, j );
1582          write_twobit_array( lineZ->ix2s, i+3, j );
1583          i += 4;
1584       }
1585       else if (m == 1) {
1586          write_twobit_array( lineZ->ix2s, i+0, j );
1587          i += 1;
1588       }
1589       else if (m == 2) {
1590          write_twobit_array( lineZ->ix2s, i+0, j );
1591          write_twobit_array( lineZ->ix2s, i+1, j );
1592          i += 2;
1593       }
1594       else {
1595          tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
1596       }
1597 
1598    }
1599 
1600    if (LIKELY(i == N_LINE_ARANGE)) {
1601       /* Construction of the compressed representation was
1602          successful. */
1603       rcinc_LineZ(lineZ);
1604       stats__cache_Z_wbacks++;
1605    } else {
1606       /* Cannot use the compressed(z) representation.  Use the full(f)
1607          rep instead. */
1608       tl_assert(i >= 0 && i < N_LINE_ARANGE);
1609       lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
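      /* dict[1] is deliberately left untouched here: with the full
         representation in use, dict[0] == SVal_INVALID acts as the
         marker, and (it appears) alloc_LineF_for_Z records the
         association with the freshly allocated LineF in the LineZ --
         see that function for the exact arrangement -- so clearing
         only the remaining slots suffices. */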
1610       lineF = alloc_LineF_for_Z (lineZ);
1611       i = 0;
1612       for (k = 0; k < csvalsUsed; k++) {
1613          if (CHECK_ZSM)
1614             tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
1615          sv = csvals[k].sval;
1616          if (CHECK_ZSM)
1617             tl_assert(sv != SVal_INVALID);
1618          for (m = csvals[k].count; m > 0; m--) {
1619             lineF->w64s[i] = sv;
1620             i++;
1621          }
1622       }
1623       tl_assert(i == N_LINE_ARANGE);
1624       rcinc_LineF(lineF);
1625       stats__cache_F_wbacks++;
1626    }
1627 }
1628 
1629 /* Fetch the cacheline 'wix' from the backing store.  The tag
1630    associated with 'wix' is assumed to have already been filled in;
1631    hence that is used to determine where in the backing store to read
1632    from. */
1633 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
1634 {
1635    Word       i;
1636    Addr       tag;
1637    CacheLine* cl;
1638    LineZ*     lineZ;
1639    LineF*     lineF;
1640 
1641    if (0)
1642    VG_(printf)("scache fetch line %d\n", (Int)wix);
1643 
1644    tl_assert(wix >= 0 && wix < N_WAY_NENT);
1645 
1646    tag =  cache_shmem.tags0[wix];
1647    cl  = &cache_shmem.lyns0[wix];
1648 
1649    /* reject nonsense requests */
1650    tl_assert(is_valid_scache_tag(tag));
1651 
1652    lineZ = NULL;
1653    lineF = NULL;
1654    find_ZF_for_reading( &lineZ, &lineF, tag );
1655    tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
1656 
1657    /* expand the data into the bottom layer of the tree, then get
1658       normalise_CacheLine to build the descriptor array. */
1659    if (lineF) {
1660       for (i = 0; i < N_LINE_ARANGE; i++) {
1661          cl->svals[i] = lineF->w64s[i];
1662       }
1663       stats__cache_F_fetches++;
1664    } else {
1665       for (i = 0; i < N_LINE_ARANGE; i++) {
1666          UWord ix = read_twobit_array( lineZ->ix2s, i );
1667          if (CHECK_ZSM) tl_assert(ix >= 0 && ix <= 3);
1668          cl->svals[i] = lineZ->dict[ix];
1669          if (CHECK_ZSM) tl_assert(cl->svals[i] != SVal_INVALID);
1670       }
1671       stats__cache_Z_fetches++;
1672    }
1673    normalise_CacheLine( cl );
1674 }
1675 
1676 /* Invalidate the cachelines corresponding to the given range, which
1677    must start and end on a cacheline boundary. */
1678 static void shmem__invalidate_scache_range (Addr ga, SizeT szB)
1679 {
1680    Word wix;
1681 
1682    /* ga must be on a cacheline boundary. */
1683    tl_assert (is_valid_scache_tag (ga));
1684    /* szB must be a multiple of cacheline size. */
1685    tl_assert (0 == (szB & (N_LINE_ARANGE - 1)));
1686 
1687 
1688    Word ga_ix = (ga >> N_LINE_BITS) & (N_WAY_NENT - 1);
1689    Word nwix = szB / N_LINE_ARANGE;
1690 
1691    if (nwix > N_WAY_NENT)
1692       nwix = N_WAY_NENT; // no need to check the same entry several times.
1693 
1694    for (wix = 0; wix < nwix; wix++) {
1695       if (address_in_range(cache_shmem.tags0[ga_ix], ga, szB))
1696          cache_shmem.tags0[ga_ix] = 1/*INVALID*/;
1697       ga_ix++;
1698       if (UNLIKELY(ga_ix == N_WAY_NENT))
1699          ga_ix = 0;
1700    }
1701 }
1702 
1703 
1704 static void shmem__flush_and_invalidate_scache ( void ) {
1705    Word wix;
1706    Addr tag;
1707    if (0) VG_(printf)("%s","scache flush and invalidate\n");
1708    tl_assert(!is_valid_scache_tag(1));
1709    for (wix = 0; wix < N_WAY_NENT; wix++) {
1710       tag = cache_shmem.tags0[wix];
1711       if (tag == 1/*INVALID*/) {
1712          /* already invalid; nothing to do */
1713       } else {
1714          tl_assert(is_valid_scache_tag(tag));
1715          cacheline_wback( wix );
1716       }
1717       cache_shmem.tags0[wix] = 1/*INVALID*/;
1718    }
1719    stats__cache_flushes_invals++;
1720 }
1721 
1722 
1723 static inline Bool aligned16 ( Addr a ) {
1724    return 0 == (a & 1);
1725 }
1726 static inline Bool aligned32 ( Addr a ) {
1727    return 0 == (a & 3);
1728 }
1729 static inline Bool aligned64 ( Addr a ) {
1730    return 0 == (a & 7);
1731 }
1732 static inline UWord get_cacheline_offset ( Addr a ) {
1733    return (UWord)(a & (N_LINE_ARANGE - 1));
1734 }
1735 static inline Addr cacheline_ROUNDUP ( Addr a ) {
1736    return ROUNDUP(a, N_LINE_ARANGE);
1737 }
1738 static inline Addr cacheline_ROUNDDN ( Addr a ) {
1739    return ROUNDDN(a, N_LINE_ARANGE);
1740 }
1741 static inline UWord get_treeno ( Addr a ) {
1742    return get_cacheline_offset(a) >> 3;
1743 }
1744 static inline UWord get_tree_offset ( Addr a ) {
1745    return a & 7;
1746 }
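
/* Illustrative example, assuming the usual N_LINE_ARANGE of 64 (see
   its definition earlier in this file): for a == 0x100B the in-line
   offset is 0x100B & 63 == 11, so the address falls in tree number
   11 >> 3 == 1 of its cache line, at tree offset 0x100B & 7 == 3.
   cacheline_ROUNDDN(0x100B) == 0x1000 and
   cacheline_ROUNDUP(0x100B) == 0x1040. */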
1747 
1748 static __attribute__((noinline))
1749        CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
1750 static inline CacheLine* get_cacheline ( Addr a )
1751 {
1752    /* tag is 'a' with the in-line offset masked out,
1753       eg a[31]..a[4] 0000 */
1754    Addr       tag = a & ~(N_LINE_ARANGE - 1);
1755    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1756    stats__cache_totrefs++;
1757    if (LIKELY(tag == cache_shmem.tags0[wix])) {
1758       return &cache_shmem.lyns0[wix];
1759    } else {
1760       return get_cacheline_MISS( a );
1761    }
1762 }
1763 
1764 static __attribute__((noinline))
1765        CacheLine* get_cacheline_MISS ( Addr a )
1766 {
1767    /* tag is 'a' with the in-line offset masked out,
1768       eg a[31]..a[4] 0000 */
1769 
1770    CacheLine* cl;
1771    Addr*      tag_old_p;
1772    Addr       tag = a & ~(N_LINE_ARANGE - 1);
1773    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
1774 
1775    tl_assert(tag != cache_shmem.tags0[wix]);
1776 
1777    /* Dump the old line into the backing store. */
1778    stats__cache_totmisses++;
1779 
1780    cl        = &cache_shmem.lyns0[wix];
1781    tag_old_p = &cache_shmem.tags0[wix];
1782 
1783    if (is_valid_scache_tag( *tag_old_p )) {
1784       /* EXPENSIVE and REDUNDANT: callee does it */
1785       if (CHECK_ZSM)
1786          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1787       cacheline_wback( wix );
1788    }
1789    /* and reload the new one */
1790    *tag_old_p = tag;
1791    cacheline_fetch( wix );
1792    if (CHECK_ZSM)
1793       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
1794    return cl;
1795 }
1796 
1797 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1798    stats__cline_64to32pulldown++;
1799    switch (toff) {
1800       case 0: case 4:
1801          tl_assert(descr & TREE_DESCR_64);
1802          tree[4] = tree[0];
1803          descr &= ~TREE_DESCR_64;
1804          descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
1805          break;
1806       default:
1807          tl_assert(0);
1808    }
1809    return descr;
1810 }
1811 
1812 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1813    stats__cline_32to16pulldown++;
1814    switch (toff) {
1815       case 0: case 2:
1816          if (!(descr & TREE_DESCR_32_0)) {
1817             descr = pulldown_to_32(tree, 0, descr);
1818          }
1819          tl_assert(descr & TREE_DESCR_32_0);
1820          tree[2] = tree[0];
1821          descr &= ~TREE_DESCR_32_0;
1822          descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
1823          break;
1824       case 4: case 6:
1825          if (!(descr & TREE_DESCR_32_1)) {
1826             descr = pulldown_to_32(tree, 4, descr);
1827          }
1828          tl_assert(descr & TREE_DESCR_32_1);
1829          tree[6] = tree[4];
1830          descr &= ~TREE_DESCR_32_1;
1831          descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
1832          break;
1833       default:
1834          tl_assert(0);
1835    }
1836    return descr;
1837 }
1838 
1839 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
1840    stats__cline_16to8pulldown++;
1841    switch (toff) {
1842       case 0: case 1:
1843          if (!(descr & TREE_DESCR_16_0)) {
1844             descr = pulldown_to_16(tree, 0, descr);
1845          }
1846          tl_assert(descr & TREE_DESCR_16_0);
1847          tree[1] = tree[0];
1848          descr &= ~TREE_DESCR_16_0;
1849          descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
1850          break;
1851       case 2: case 3:
1852          if (!(descr & TREE_DESCR_16_1)) {
1853             descr = pulldown_to_16(tree, 2, descr);
1854          }
1855          tl_assert(descr & TREE_DESCR_16_1);
1856          tree[3] = tree[2];
1857          descr &= ~TREE_DESCR_16_1;
1858          descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
1859          break;
1860       case 4: case 5:
1861          if (!(descr & TREE_DESCR_16_2)) {
1862             descr = pulldown_to_16(tree, 4, descr);
1863          }
1864          tl_assert(descr & TREE_DESCR_16_2);
1865          tree[5] = tree[4];
1866          descr &= ~TREE_DESCR_16_2;
1867          descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
1868          break;
1869       case 6: case 7:
1870          if (!(descr & TREE_DESCR_16_3)) {
1871             descr = pulldown_to_16(tree, 6, descr);
1872          }
1873          tl_assert(descr & TREE_DESCR_16_3);
1874          tree[7] = tree[6];
1875          descr &= ~TREE_DESCR_16_3;
1876          descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
1877          break;
1878       default:
1879          tl_assert(0);
1880    }
1881    return descr;
1882 }
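
/* Worked example (illustrative only): starting from a tree described
   by just TREE_DESCR_64, an 8-bit access at tree offset 3 calls
   pulldown_to_8(.., 3, ..), which pulls down through the 16- and
   32-bit layers as needed.  The 64 splits into 32_1|32_0 (tree[4]
   gets a copy of tree[0]), 32_0 splits into 16_1|16_0 (tree[2] gets a
   copy), and 16_1 splits into 8_3|8_2 (tree[3] gets a copy).  The
   resulting descriptor is TREE_DESCR_32_1 | TREE_DESCR_16_0
   | TREE_DESCR_8_3 | TREE_DESCR_8_2. */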
1883 
1884 
1885 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
1886    UShort mask;
1887    switch (toff) {
1888       case 0:
1889          mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
1890          tl_assert( (descr & mask) == mask );
1891          descr &= ~mask;
1892          descr |= TREE_DESCR_16_0;
1893          break;
1894       case 2:
1895          mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
1896          tl_assert( (descr & mask) == mask );
1897          descr &= ~mask;
1898          descr |= TREE_DESCR_16_1;
1899          break;
1900       case 4:
1901          mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
1902          tl_assert( (descr & mask) == mask );
1903          descr &= ~mask;
1904          descr |= TREE_DESCR_16_2;
1905          break;
1906       case 6:
1907          mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
1908          tl_assert( (descr & mask) == mask );
1909          descr &= ~mask;
1910          descr |= TREE_DESCR_16_3;
1911          break;
1912       default:
1913          tl_assert(0);
1914    }
1915    return descr;
1916 }
1917 
1918 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
1919    UShort mask;
1920    switch (toff) {
1921       case 0:
1922          if (!(descr & TREE_DESCR_16_0))
1923             descr = pullup_descr_to_16(descr, 0);
1924          if (!(descr & TREE_DESCR_16_1))
1925             descr = pullup_descr_to_16(descr, 2);
1926          mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
1927          tl_assert( (descr & mask) == mask );
1928          descr &= ~mask;
1929          descr |= TREE_DESCR_32_0;
1930          break;
1931       case 4:
1932          if (!(descr & TREE_DESCR_16_2))
1933             descr = pullup_descr_to_16(descr, 4);
1934          if (!(descr & TREE_DESCR_16_3))
1935             descr = pullup_descr_to_16(descr, 6);
1936          mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
1937          tl_assert( (descr & mask) == mask );
1938          descr &= ~mask;
1939          descr |= TREE_DESCR_32_1;
1940          break;
1941       default:
1942          tl_assert(0);
1943    }
1944    return descr;
1945 }
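
/* Unlike the pulldown_* routines, the pullup_descr_* routines rewrite
   only the descriptor: they ensure the narrower children are present
   (recursively pulling up where necessary) and then replace them with
   the single wider node, without touching the leaf values themselves.
   They are therefore only safe when the caller is about to overwrite
   the whole widened sub-range, which is how the wide-store paths later
   in this file appear to use them. */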
1946 
1947 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
1948    switch (toff) {
1949       case 0: case 4:
1950          return 0 != (descr & TREE_DESCR_64);
1951       default:
1952          tl_assert(0);
1953    }
1954 }
1955 
1956 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
1957    switch (toff) {
1958       case 0:
1959          return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
1960       case 2:
1961          return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
1962       case 4:
1963          return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
1964       case 6:
1965          return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
1966       default:
1967          tl_assert(0);
1968    }
1969 }
1970 
1971 /* ------------ Cache management ------------ */
1972 
1973 static void zsm_flush_cache ( void )
1974 {
1975    shmem__flush_and_invalidate_scache();
1976 }
1977 
1978 
1979 static void zsm_init ( void )
1980 {
1981    tl_assert( sizeof(UWord) == sizeof(Addr) );
1982 
1983    tl_assert(map_shmem == NULL);
1984    map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
1985                            HG_(free),
1986                            NULL/*unboxed UWord cmp*/);
1987    /* Invalidate all cache entries. */
1988    tl_assert(!is_valid_scache_tag(1));
1989    for (UWord wix = 0; wix < N_WAY_NENT; wix++) {
1990       cache_shmem.tags0[wix] = 1/*INVALID*/;
1991    }
1992 
1993    LineF_pool_allocator = VG_(newPA) (
1994                              sizeof(LineF),
1995                              /* Nr elements/pool to fill a core arena block
1996                                 taking some arena overhead into account. */
1997                              (4 * 1024 * 1024 - 200)/sizeof(LineF),
1998                              HG_(zalloc),
1999                              "libhb.LineF_storage.pool",
2000                              HG_(free)
2001                           );
2002 
2003    /* a SecMap must contain an integral number of CacheLines */
2004    tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
2005    /* also ... a CacheLine holds an integral number of trees */
2006    tl_assert(0 == (N_LINE_ARANGE % 8));
2007 }
2008 
2009 /////////////////////////////////////////////////////////////////
2010 /////////////////////////////////////////////////////////////////
2011 //                                                             //
2012 // SECTION END compressed shadow memory                        //
2013 //                                                             //
2014 /////////////////////////////////////////////////////////////////
2015 /////////////////////////////////////////////////////////////////
2016 
2017 
2018 
2019 /////////////////////////////////////////////////////////////////
2020 /////////////////////////////////////////////////////////////////
2021 //                                                             //
2022 // SECTION BEGIN vts primitives                                //
2023 //                                                             //
2024 /////////////////////////////////////////////////////////////////
2025 /////////////////////////////////////////////////////////////////
2026 
2027 
2028 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
2029    being compact stand-ins for Thr*'s.  Use these functions to map
2030    between them. */
2031 static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
2032 static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
2033 
2034 __attribute__((noreturn))
2035 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
2036 {
2037    if (due_to_nThrs) {
2038       const HChar* s =
2039          "\n"
2040          "Helgrind: cannot continue, run aborted: too many threads.\n"
2041          "Sorry.  Helgrind can only handle programs that create\n"
2042          "%'llu or fewer threads over their entire lifetime.\n"
2043          "\n";
2044       VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
2045    } else {
2046       const HChar* s =
2047          "\n"
2048          "Helgrind: cannot continue, run aborted: too many\n"
2049          "synchronisation events.  Sorry. Helgrind can only handle\n"
2050          "programs which perform %'llu or fewer\n"
2051          "inter-thread synchronisation events (locks, unlocks, etc).\n"
2052          "\n";
2053       VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
2054    }
2055    VG_(exit)(1);
2056    /*NOTREACHED*/
2057    tl_assert(0); /*wtf?!*/
2058 }
2059 
2060 
2061 /* The dead thread (ThrID, actually) tables.  A thread may only be
2062    listed here if we have been notified thereof by libhb_async_exit.
2063    New entries are added at the end.  The order isn't important, but
2064    the ThrID values must be unique.
2065    verydead_thread_table_not_pruned lists the identity of the threads
2066    that died since the previous round of pruning.
2067    Once pruning is done, these ThrID are added in verydead_thread_table.
2068    We don't actually need to keep the set of threads that have ever died --
2069    only the threads that have died since the previous round of
2070    pruning.  But it's useful for sanity check purposes to keep the
2071    entire set, so we do. */
2072 static XArray* /* of ThrID */ verydead_thread_table_not_pruned = NULL;
2073 static XArray* /* of ThrID */ verydead_thread_table = NULL;
2074 
2075 /* Arbitrary total ordering on ThrIDs. */
2076 static Int cmp__ThrID ( const void* v1, const void* v2 ) {
2077    ThrID id1 = *(const ThrID*)v1;
2078    ThrID id2 = *(const ThrID*)v2;
2079    if (id1 < id2) return -1;
2080    if (id1 > id2) return 1;
2081    return 0;
2082 }
2083 
2084 static void verydead_thread_tables_init ( void )
2085 {
2086    tl_assert(!verydead_thread_table);
2087    tl_assert(!verydead_thread_table_not_pruned);
2088    verydead_thread_table
2089      = VG_(newXA)( HG_(zalloc),
2090                    "libhb.verydead_thread_table_init.1",
2091                    HG_(free), sizeof(ThrID) );
2092    VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
2093    verydead_thread_table_not_pruned
2094      = VG_(newXA)( HG_(zalloc),
2095                    "libhb.verydead_thread_table_init.2",
2096                    HG_(free), sizeof(ThrID) );
2097    VG_(setCmpFnXA)(verydead_thread_table_not_pruned, cmp__ThrID);
2098 }
2099 
2100 static void verydead_thread_table_sort_and_check (XArray* thrids)
2101 {
2102    UWord i;
2103 
2104    VG_(sortXA)( thrids );
2105    /* Sanity check: check for unique ThrID values. */
2106    UWord nBT = VG_(sizeXA)( thrids );
2107    if (nBT > 0) {
2108       ThrID thrid1, thrid2;
2109       thrid2 = *(ThrID*)VG_(indexXA)( thrids, 0 );
2110       for (i = 1; i < nBT; i++) {
2111          thrid1 = thrid2;
2112          thrid2 = *(ThrID*)VG_(indexXA)( thrids, i );
2113          tl_assert(thrid1 < thrid2);
2114       }
2115    }
2116    /* Ok, so the dead thread table thrids has unique and in-order keys. */
2117 }
2118 
2119 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
2120    a backlink for the caller's convenience.  Since we have no idea
2121    what to set that to in the library, it always gets set to
2122    VtsID_INVALID. */
2123 typedef
2124    struct {
2125       VtsID    id;
2126       UInt     usedTS;
2127       UInt     sizeTS;
2128       ScalarTS ts[0];
2129    }
2130    VTS;
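
/* Interpretation (informal): a VTS is a vector clock.  .ts[] holds
   (thrid, tym) pairs sorted by strictly increasing ThrID, with every
   stored tym >= 1; any thread not mentioned implicitly has timestamp
   zero.  So, for example, a VTS containing { (t1,3), (t7,1) } says
   "thread t1 is at local time 3, thread t7 is at time 1, and every
   other thread is at time 0". */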
2131 
2132 /* Allocate a VTS capable of storing 'sizeTS' entries. */
2133 static VTS* VTS__new ( const HChar* who, UInt sizeTS );
2134 
2135 /* Make a clone of 'vts', sizing the new array to exactly match the
2136    number of ScalarTSs present. */
2137 static VTS* VTS__clone ( const HChar* who, VTS* vts );
2138 
2139 /* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
2140    array is sized exactly to hold the number of required elements.
2141    'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
2142    must be in strictly increasing order. */
2143 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel );
2144 
2145 /* Delete this VTS in its entirety. */
2146 static void VTS__delete ( VTS* vts );
2147 
2148 /* Create a new singleton VTS in 'out'.  Caller must have
2149    pre-allocated 'out' sufficiently big to hold the result in all
2150    possible cases. */
2151 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
2152 
2153 /* Create in 'out' a VTS which is the same as 'vts' except with
2154    vts[me]++, so to speak.  Caller must have pre-allocated 'out'
2155    sufficiently big to hold the result in all possible cases. */
2156 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
2157 
2158 /* Create in 'out' a VTS which is the join (max) of 'a' and
2159    'b'. Caller must have pre-allocated 'out' sufficiently big to hold
2160    the result in all possible cases. */
2161 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
2162 
2163 /* Compute the partial ordering relation of the two args.  Although we
2164    could be completely general and return an enumeration value (EQ,
2165    LT, GT, UN), in fact we only need LEQ, and so we may as well
2166    hardwire that fact.
2167 
2168    Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
2169    invalid ThrID).  In the latter case, the returned ThrID indicates
2170    a point at which the ordering fails.  There may be more
2171    than one such point, but we only care about seeing one of them, not
2172    all of them.  This rather strange convention is used because
2173    sometimes we want to know the actual index at which they first
2174    differ. */
2175 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
2176 
2177 /* Compute an arbitrary structural (total) ordering on the two args,
2178    based on their VCs, so they can be looked up in a table, tree, etc.
2179    Returns -1, 0 or 1. */
2180 static Word VTS__cmp_structural ( VTS* a, VTS* b );
2181 
2182 /* Debugging only.  Display the given VTS. */
2183 static void VTS__show ( const VTS* vts );
2184 
2185 /* Debugging only.  Return vts[index], so to speak. */
2186 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
2187 
2188 /* Notify the VTS machinery that a thread has been declared
2189    comprehensively dead: that is, it has done an async exit AND it has
2190    been joined with.  This should ensure that its local clocks (.viR
2191    and .viW) will never again change, and so all mentions of this
2192    thread from all VTSs in the system may be removed. */
2193 static void VTS__declare_thread_very_dead ( Thr* idx );
2194 
2195 /*--------------- to do with Vector Timestamps ---------------*/
2196 
2197 static Bool is_sane_VTS ( VTS* vts )
2198 {
2199    UWord     i, n;
2200    ScalarTS  *st1, *st2;
2201    if (!vts) return False;
2202    if (vts->usedTS > vts->sizeTS) return False;
2203    n = vts->usedTS;
2204    if (n == 1) {
2205       st1 = &vts->ts[0];
2206       if (st1->tym == 0)
2207          return False;
2208    }
2209    else
2210    if (n >= 2) {
2211       for (i = 0; i < n-1; i++) {
2212          st1 = &vts->ts[i];
2213          st2 = &vts->ts[i+1];
2214          if (st1->thrid >= st2->thrid)
2215             return False;
2216          if (st1->tym == 0 || st2->tym == 0)
2217             return False;
2218       }
2219    }
2220    return True;
2221 }
2222 
2223 
2224 /* Create a new, empty VTS.
2225 */
2226 static VTS* VTS__new ( const HChar* who, UInt sizeTS )
2227 {
2228    VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
2229    tl_assert(vts->usedTS == 0);
2230    vts->sizeTS = sizeTS;
2231    *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
2232    return vts;
2233 }
2234 
2235 /* Clone this VTS.
2236 */
2237 static VTS* VTS__clone ( const HChar* who, VTS* vts )
2238 {
2239    tl_assert(vts);
2240    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2241    UInt nTS = vts->usedTS;
2242    VTS* clone = VTS__new(who, nTS);
2243    clone->id = vts->id;
2244    clone->sizeTS = nTS;
2245    clone->usedTS = nTS;
2246    UInt i;
2247    for (i = 0; i < nTS; i++) {
2248       clone->ts[i] = vts->ts[i];
2249    }
2250    tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2251    return clone;
2252 }
2253 
2254 
2255 /* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
2256    must be in strictly increasing order.  We could obviously do this
2257    much more efficiently (in linear time) if necessary.
2258 */
2259 static VTS* VTS__subtract ( const HChar* who, VTS* vts, XArray* thridsToDel )
2260 {
2261    UInt i, j;
2262    tl_assert(vts);
2263    tl_assert(thridsToDel);
2264    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2265    UInt nTS = vts->usedTS;
2266    /* Figure out how many ScalarTSs will remain in the output. */
2267    UInt nReq = nTS;
2268    for (i = 0; i < nTS; i++) {
2269       ThrID thrid = vts->ts[i].thrid;
2270       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2271          nReq--;
2272    }
2273    tl_assert(nReq <= nTS);
2274    /* Copy the ones that will remain. */
2275    VTS* res = VTS__new(who, nReq);
2276    j = 0;
2277    for (i = 0; i < nTS; i++) {
2278       ThrID thrid = vts->ts[i].thrid;
2279       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
2280          continue;
2281       res->ts[j++] = vts->ts[i];
2282    }
2283    tl_assert(j == nReq);
2284    tl_assert(j == res->sizeTS);
2285    res->usedTS = j;
2286    tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
2287    return res;
2288 }
2289 
2290 
2291 /* Delete this VTS in its entirety.
2292 */
2293 static void VTS__delete ( VTS* vts )
2294 {
2295    tl_assert(vts);
2296    tl_assert(vts->usedTS <= vts->sizeTS);
2297    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
2298    HG_(free)(vts);
2299 }
2300 
2301 
2302 /* Create a new singleton VTS.
2303 */
2304 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
2305 {
2306    tl_assert(thr);
2307    tl_assert(tym >= 1);
2308    tl_assert(out);
2309    tl_assert(out->usedTS == 0);
2310    tl_assert(out->sizeTS >= 1);
2311    UInt hi = out->usedTS++;
2312    out->ts[hi].thrid = Thr__to_ThrID(thr);
2313    out->ts[hi].tym   = tym;
2314 }
2315 
2316 
2317 /* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
2318    not modified.
2319 */
2320 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
2321 {
2322    UInt      i, n;
2323    ThrID     me_thrid;
2324    Bool      found = False;
2325 
2326    stats__vts__tick++;
2327 
2328    tl_assert(out);
2329    tl_assert(out->usedTS == 0);
2330    if (vts->usedTS >= ThrID_MAX_VALID)
2331       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2332    tl_assert(out->sizeTS >= 1 + vts->usedTS);
2333 
2334    tl_assert(me);
2335    me_thrid = Thr__to_ThrID(me);
2336    tl_assert(is_sane_VTS(vts));
2337    n = vts->usedTS;
2338 
2339    /* Copy all entries which precede 'me'. */
2340    for (i = 0; i < n; i++) {
2341       ScalarTS* here = &vts->ts[i];
2342       if (UNLIKELY(here->thrid >= me_thrid))
2343          break;
2344       UInt hi = out->usedTS++;
2345       out->ts[hi] = *here;
2346    }
2347 
2348    /* 'i' now indicates the next entry to copy, if any.
2349        There are 3 possibilities:
2350        (a) there is no next entry (we used them all up already):
2351            add (me_thrid,1) to the output, and quit
2352        (b) there is a next entry, and its thrid > me_thrid:
2353            add (me_thrid,1) to the output, then copy the remaining entries
2354        (c) there is a next entry, and its thrid == me_thrid:
2355            copy it to the output but increment its timestamp value.
2356            Then copy the remaining entries.  (c) is the common case.
2357    */
2358    tl_assert(i >= 0 && i <= n);
2359    if (i == n) { /* case (a) */
2360       UInt hi = out->usedTS++;
2361       out->ts[hi].thrid = me_thrid;
2362       out->ts[hi].tym   = 1;
2363    } else {
2364       /* cases (b) and (c) */
2365       ScalarTS* here = &vts->ts[i];
2366       if (me_thrid == here->thrid) { /* case (c) */
2367          if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
2368             /* We're hosed.  We have to stop. */
2369             scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
2370          }
2371          UInt hi = out->usedTS++;
2372          out->ts[hi].thrid = here->thrid;
2373          out->ts[hi].tym   = here->tym + 1;
2374          i++;
2375          found = True;
2376       } else { /* case (b) */
2377          UInt hi = out->usedTS++;
2378          out->ts[hi].thrid = me_thrid;
2379          out->ts[hi].tym   = 1;
2380       }
2381       /* And copy any remaining entries. */
2382       for (/*keepgoing*/; i < n; i++) {
2383          ScalarTS* here2 = &vts->ts[i];
2384          UInt hi = out->usedTS++;
2385          out->ts[hi] = *here2;
2386       }
2387    }
2388 
2389    tl_assert(is_sane_VTS(out));
2390    tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
2391    tl_assert(out->usedTS <= out->sizeTS);
2392 }
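
/* Example (illustrative only): with me's ThrID == t, ticking
   { (a,2), (t,5), (z,9) } produces { (a,2), (t,6), (z,9) } -- case (c)
   above.  If t is not yet present, an entry (t,1) is inserted at the
   correct sorted position -- cases (a) and (b). */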
2393 
2394 
2395 /* Return a new VTS constructed as the join (max) of the 2 args.
2396    Neither arg is modified.
2397 */
2398 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
2399 {
2400    UInt     ia, ib, useda, usedb;
2401    ULong    tyma, tymb, tymMax;
2402    ThrID    thrid;
2403    UInt     ncommon = 0;
2404 
2405    stats__vts__join++;
2406 
2407    tl_assert(a);
2408    tl_assert(b);
2409    useda = a->usedTS;
2410    usedb = b->usedTS;
2411 
2412    tl_assert(out);
2413    tl_assert(out->usedTS == 0);
2414    /* overly conservative test, but doing better involves comparing
2415       the two VTSs, which we don't want to do at this point. */
2416    if (useda + usedb >= ThrID_MAX_VALID)
2417       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
2418    tl_assert(out->sizeTS >= useda + usedb);
2419 
2420    ia = ib = 0;
2421 
2422    while (1) {
2423 
2424       /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
2425          from a and b in order, where thrid is the next ThrID
2426          occurring in either a or b, and tyma/b are the relevant
2427          scalar timestamps, taking into account implicit zeroes. */
2428       tl_assert(ia >= 0 && ia <= useda);
2429       tl_assert(ib >= 0 && ib <= usedb);
2430 
2431       if        (ia == useda && ib == usedb) {
2432          /* both empty - done */
2433          break;
2434 
2435       } else if (ia == useda && ib != usedb) {
2436          /* a empty, use up b */
2437          ScalarTS* tmpb = &b->ts[ib];
2438          thrid = tmpb->thrid;
2439          tyma  = 0;
2440          tymb  = tmpb->tym;
2441          ib++;
2442 
2443       } else if (ia != useda && ib == usedb) {
2444          /* b empty, use up a */
2445          ScalarTS* tmpa = &a->ts[ia];
2446          thrid = tmpa->thrid;
2447          tyma  = tmpa->tym;
2448          tymb  = 0;
2449          ia++;
2450 
2451       } else {
2452          /* both not empty; extract lowest-ThrID'd triple */
2453          ScalarTS* tmpa = &a->ts[ia];
2454          ScalarTS* tmpb = &b->ts[ib];
2455          if (tmpa->thrid < tmpb->thrid) {
2456             /* a has the lowest unconsidered ThrID */
2457             thrid = tmpa->thrid;
2458             tyma  = tmpa->tym;
2459             tymb  = 0;
2460             ia++;
2461          } else if (tmpa->thrid > tmpb->thrid) {
2462             /* b has the lowest unconsidered ThrID */
2463             thrid = tmpb->thrid;
2464             tyma  = 0;
2465             tymb  = tmpb->tym;
2466             ib++;
2467          } else {
2468             /* they both next mention the same ThrID */
2469             tl_assert(tmpa->thrid == tmpb->thrid);
2470             thrid = tmpa->thrid; /* == tmpb->thrid */
2471             tyma  = tmpa->tym;
2472             tymb  = tmpb->tym;
2473             ia++;
2474             ib++;
2475             ncommon++;
2476          }
2477       }
2478 
2479       /* having laboriously determined (thr, tyma, tymb), do something
2480          useful with it. */
2481       tymMax = tyma > tymb ? tyma : tymb;
2482       if (tymMax > 0) {
2483          UInt hi = out->usedTS++;
2484          out->ts[hi].thrid = thrid;
2485          out->ts[hi].tym   = tymMax;
2486       }
2487 
2488    }
2489 
2490    tl_assert(is_sane_VTS(out));
2491    tl_assert(out->usedTS <= out->sizeTS);
2492    tl_assert(out->usedTS == useda + usedb - ncommon);
2493 }
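
/* Example (illustrative only): joining { (a,2), (b,1) } with
   { (b,3), (c,4) } yields { (a,2), (b,3), (c,4) }: a per-thread
   maximum, with a thread missing from one argument treated as having
   timestamp zero there.  The sketch below (disabled; thrA and thrB
   stand for any two registered Thr*s with distinct ThrIDs) shows how
   these primitives are meant to compose, in particular that 'out'
   must be pre-sized for the worst case of |a| + |b| entries. */
#if 0
static void example_VTS_join ( Thr* thrA, Thr* thrB )
{
   VTS *a, *b, *j;
   a = VTS__new( "example.a", 1 );   VTS__singleton( a, thrA, 2 );
   b = VTS__new( "example.b", 1 );   VTS__singleton( b, thrB, 3 );
   j = VTS__new( "example.j", a->usedTS + b->usedTS );
   VTS__join( j, a, b );        /* both entries, ordered by ThrID */
   VTS__delete( a );  VTS__delete( b );  VTS__delete( j );
}
#endif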
2494 
2495 
2496 /* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
2497    they are, or the first ThrID for which they are not (no valid ThrID
2498    has the value zero).  This rather strange convention is used
2499    because sometimes we want to know the actual index at which they
2500    first differ. */
2501 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
2502 {
2503    Word  ia, ib, useda, usedb;
2504    ULong tyma, tymb;
2505 
2506    stats__vts__cmpLEQ++;
2507 
2508    tl_assert(a);
2509    tl_assert(b);
2510    useda = a->usedTS;
2511    usedb = b->usedTS;
2512 
2513    ia = ib = 0;
2514 
2515    while (1) {
2516 
2517       /* This logic is to enumerate doubles (tyma, tymb) drawn
2518          from a and b in order, and tyma/b are the relevant
2519          scalar timestamps, taking into account implicit zeroes. */
2520       ThrID thrid;
2521 
2522       tl_assert(ia >= 0 && ia <= useda);
2523       tl_assert(ib >= 0 && ib <= usedb);
2524 
2525       if        (ia == useda && ib == usedb) {
2526          /* both empty - done */
2527          break;
2528 
2529       } else if (ia == useda && ib != usedb) {
2530          /* a empty, use up b */
2531          ScalarTS* tmpb = &b->ts[ib];
2532          tyma  = 0;
2533          tymb  = tmpb->tym;
2534          thrid = tmpb->thrid;
2535          ib++;
2536 
2537       } else if (ia != useda && ib == usedb) {
2538          /* b empty, use up a */
2539          ScalarTS* tmpa = &a->ts[ia];
2540          tyma  = tmpa->tym;
2541          thrid = tmpa->thrid;
2542          tymb  = 0;
2543          ia++;
2544 
2545       } else {
2546          /* both not empty; extract lowest-ThrID'd triple */
2547          ScalarTS* tmpa = &a->ts[ia];
2548          ScalarTS* tmpb = &b->ts[ib];
2549          if (tmpa->thrid < tmpb->thrid) {
2550             /* a has the lowest unconsidered ThrID */
2551             tyma  = tmpa->tym;
2552             thrid = tmpa->thrid;
2553             tymb  = 0;
2554             ia++;
2555          }
2556          else
2557          if (tmpa->thrid > tmpb->thrid) {
2558             /* b has the lowest unconsidered ThrID */
2559             tyma  = 0;
2560             tymb  = tmpb->tym;
2561             thrid = tmpb->thrid;
2562             ib++;
2563          } else {
2564             /* they both next mention the same ThrID */
2565             tl_assert(tmpa->thrid == tmpb->thrid);
2566             tyma  = tmpa->tym;
2567             thrid = tmpa->thrid;
2568             tymb  = tmpb->tym;
2569             ia++;
2570             ib++;
2571          }
2572       }
2573 
2574       /* having laboriously determined (tyma, tymb), do something
2575          useful with it. */
2576       if (tyma > tymb) {
2577          /* not LEQ at this index.  Quit, since the answer is
2578             determined already. */
2579          tl_assert(thrid >= 1024);
2580          return thrid;
2581       }
2582    }
2583 
2584    return 0; /* all points are LEQ => return an invalid ThrID */
2585 }
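
/* Example (illustrative only): cmpLEQ( {(a,2),(b,1)}, {(a,3),(b,1)} )
   returns 0, since every component of the first argument is <= the
   corresponding component of the second.  cmpLEQ( {(a,4)}, {(a,3)} )
   returns a's ThrID, a witness at which the ordering fails; likewise,
   any thread present only in 'a' (necessarily with a nonzero
   timestamp) defeats LEQ. */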
2586 
2587 
2588 /* Compute an arbitrary structural (total) ordering on the two args,
2589    based on their VCs, so they can be looked up in a table, tree, etc.
2590    Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
2591    performance critical, so some effort is expended to make it as
2592    fast as possible.
2593 */
2594 Word VTS__cmp_structural ( VTS* a, VTS* b )
2595 {
2596    /* We just need to generate an arbitrary total ordering based on
2597       a->ts and b->ts.  Preferably do it in a way which detects likely
2598       differences relatively quickly. */
2599    Word     i;
2600    Word     useda = 0,    usedb = 0;
2601    ScalarTS *ctsa = NULL, *ctsb = NULL;
2602 
2603    stats__vts__cmp_structural++;
2604 
2605    tl_assert(a);
2606    tl_assert(b);
2607 
2608    ctsa = &a->ts[0]; useda = a->usedTS;
2609    ctsb = &b->ts[0]; usedb = b->usedTS;
2610 
2611    if (LIKELY(useda == usedb)) {
2612       ScalarTS *tmpa = NULL, *tmpb = NULL;
2613       stats__vts__cmp_structural_slow++;
2614       /* Same length vectors.  Find the first difference, if any, as
2615          fast as possible. */
2616       for (i = 0; i < useda; i++) {
2617          tmpa = &ctsa[i];
2618          tmpb = &ctsb[i];
2619          if (LIKELY(tmpa->tym == tmpb->tym
2620                     && tmpa->thrid == tmpb->thrid))
2621             continue;
2622          else
2623             break;
2624       }
2625       if (UNLIKELY(i == useda)) {
2626          /* They're identical. */
2627          return 0;
2628       } else {
2629          tl_assert(i >= 0 && i < useda);
2630          if (tmpa->tym < tmpb->tym) return -1;
2631          if (tmpa->tym > tmpb->tym) return 1;
2632          if (tmpa->thrid < tmpb->thrid) return -1;
2633          if (tmpa->thrid > tmpb->thrid) return 1;
2634          /* we just established them as non-identical, hence: */
2635       }
2636       /*NOTREACHED*/
2637       tl_assert(0);
2638    }
2639 
2640    if (useda < usedb) return -1;
2641    if (useda > usedb) return 1;
2642    /*NOTREACHED*/
2643    tl_assert(0);
2644 }
2645 
2646 
2647 /* Debugging only.  Display the given VTS.
2648 */
2649 static void VTS__show ( const VTS* vts )
2650 {
2651    Word      i, n;
2652    tl_assert(vts);
2653 
2654    VG_(printf)("[");
2655    n =  vts->usedTS;
2656    for (i = 0; i < n; i++) {
2657       const ScalarTS *st = &vts->ts[i];
2658       VG_(printf)(i < n-1 ? "%d:%llu " : "%d:%llu", st->thrid, (ULong)st->tym);
2659    }
2660    VG_(printf)("]");
2661 }
2662 
2663 
2664 /* Debugging only.  Return vts[index], so to speak.
2665 */
2666 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
2667 {
2668    UWord i, n;
2669    ThrID idx_thrid = Thr__to_ThrID(idx);
2670    stats__vts__indexat_slow++;
2671    tl_assert(vts);
2672    n = vts->usedTS;
2673    for (i = 0; i < n; i++) {
2674       ScalarTS* st = &vts->ts[i];
2675       if (st->thrid == idx_thrid)
2676          return st->tym;
2677    }
2678    return 0;
2679 }
2680 
2681 
2682 /* See comment on prototype above.
2683 */
2684 static void VTS__declare_thread_very_dead ( Thr* thr )
2685 {
2686    if (0) VG_(printf)("VTQ:  tae %p\n", thr);
2687 
2688    tl_assert(thr->llexit_done);
2689    tl_assert(thr->joinedwith_done);
2690 
2691    ThrID nyu;
2692    nyu = Thr__to_ThrID(thr);
2693    VG_(addToXA)( verydead_thread_table_not_pruned, &nyu );
2694 
2695    /* We can only get here if we're assured that we'll never again
2696       need to look at this thread's ::viR or ::viW.  Set them to
2697       VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
2698       mostly so that we don't wind up pruning them (as that would be
2699       nonsensical: the only interesting ScalarTS entry for a dead
2700       thread is its own index, and the pruning will remove that.). */
2701    VtsID__rcdec(thr->viR);
2702    VtsID__rcdec(thr->viW);
2703    thr->viR = VtsID_INVALID;
2704    thr->viW = VtsID_INVALID;
2705 }
2706 
2707 
2708 /////////////////////////////////////////////////////////////////
2709 /////////////////////////////////////////////////////////////////
2710 //                                                             //
2711 // SECTION END vts primitives                                  //
2712 //                                                             //
2713 /////////////////////////////////////////////////////////////////
2714 /////////////////////////////////////////////////////////////////
2715 
2716 
2717 
2718 /////////////////////////////////////////////////////////////////
2719 /////////////////////////////////////////////////////////////////
2720 //                                                             //
2721 // SECTION BEGIN main library                                  //
2722 //                                                             //
2723 /////////////////////////////////////////////////////////////////
2724 /////////////////////////////////////////////////////////////////
2725 
2726 
2727 /////////////////////////////////////////////////////////
2728 //                                                     //
2729 // VTS set                                             //
2730 //                                                     //
2731 /////////////////////////////////////////////////////////
2732 
2733 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
2734 
2735 static void vts_set_init ( void )
2736 {
2737    tl_assert(!vts_set);
2738    vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
2739                          HG_(free),
2740                          (Word(*)(UWord,UWord))VTS__cmp_structural );
2741 }
2742 
2743 /* Given a VTS, look in vts_set to see if we already have a
2744    structurally identical one.  If yes, return the pair (True, pointer
2745    to the existing one).  If no, clone this one, add the clone to the
2746    set, and return (False, pointer to the clone). */
2747 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
2748 {
2749    UWord keyW, valW;
2750    stats__vts_set__focaa++;
2751    tl_assert(cand->id == VtsID_INVALID);
2752    /* lookup cand (by value) */
2753    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
2754       /* found it */
2755       tl_assert(valW == 0);
2756       /* if this fails, cand (by ref) was already present (!) */
2757       tl_assert(keyW != (UWord)cand);
2758       *res = (VTS*)keyW;
2759       return True;
2760    } else {
2761       /* not present.  Clone, add and return address of clone. */
2762       stats__vts_set__focaa_a++;
2763       VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
2764       tl_assert(clone != cand);
2765       VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
2766       *res = clone;
2767       return False;
2768    }
2769 }
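
/* The effect is that vts_set interns VTSs: every distinct vector
   clock value is represented by exactly one heap copy, so structural
   equality of previously-added VTSs reduces to pointer equality (and,
   once vts_tab has assigned them, to VtsID equality). */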
2770 
2771 
2772 /////////////////////////////////////////////////////////
2773 //                                                     //
2774 // VTS table                                           //
2775 //                                                     //
2776 /////////////////////////////////////////////////////////
2777 
2778 static void VtsID__invalidate_caches ( void ); /* fwds */
2779 
2780 /* A type to hold VTS table entries.  Invariants:
2781    If .vts == NULL, then this entry is not in use, so:
2782    - .rc == 0
2783    - this entry is on the freelist (unfortunately, does not imply
2784      any constraints on value for u.freelink)
2785    If .vts != NULL, then this entry is in use:
2786    - .vts is findable in vts_set
2787    - .vts->id == this entry number
2788    - no specific value for .rc (even 0 is OK)
2789    - this entry is not on freelist, so u.freelink == VtsID_INVALID
2790 */
2791 typedef
2792    struct {
2793       VTS*  vts;      /* vts, in vts_set */
2794       UWord rc;       /* reference count - enough for entire aspace */
2795       union {
2796          VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
2797          VtsID remap;    /* used only during pruning, for used entries */
2798       } u;
2799       /* u.freelink only used when vts == NULL,
2800          u.remap only used when vts != NULL, during pruning. */
2801    }
2802    VtsTE;
2803 
2804 /* The VTS table. */
2805 static XArray* /* of VtsTE */ vts_tab = NULL;
2806 
2807 /* An index into the VTS table, indicating the start of the list of
2808    free (available for use) entries.  If the list is empty, this is
2809    VtsID_INVALID. */
2810 static VtsID vts_tab_freelist = VtsID_INVALID;
2811 
2812 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
2813    vts_tab equals or exceeds this size.  After GC, the value here is
2814    set appropriately so as to check for the next GC point. */
2815 static Word vts_next_GC_at = 1000;
2816 
2817 static void vts_tab_init ( void )
2818 {
2819    vts_tab = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
2820                          HG_(free), sizeof(VtsTE) );
2821    vts_tab_freelist = VtsID_INVALID;
2822 }
2823 
2824 /* Add ii to the free list, checking that it looks out-of-use. */
2825 static void add_to_free_list ( VtsID ii )
2826 {
2827    VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2828    tl_assert(ie->vts == NULL);
2829    tl_assert(ie->rc == 0);
2830    tl_assert(ie->u.freelink == VtsID_INVALID);
2831    ie->u.freelink = vts_tab_freelist;
2832    vts_tab_freelist = ii;
2833 }
2834 
2835 /* Get an entry from the free list.  This will return VtsID_INVALID if
2836    the free list is empty. */
2837 static VtsID get_from_free_list ( void )
2838 {
2839    VtsID  ii;
2840    VtsTE* ie;
2841    if (vts_tab_freelist == VtsID_INVALID)
2842       return VtsID_INVALID;
2843    ii = vts_tab_freelist;
2844    ie = VG_(indexXA)( vts_tab, ii );
2845    tl_assert(ie->vts == NULL);
2846    tl_assert(ie->rc == 0);
2847    vts_tab_freelist = ie->u.freelink;
2848    return ii;
2849 }
2850 
2851 /* Produce a new VtsID that can be used, either by getting it from
2852    the freelist, or, if that is empty, by expanding vts_tab. */
2853 static VtsID get_new_VtsID ( void )
2854 {
2855    VtsID ii;
2856    VtsTE te;
2857    ii = get_from_free_list();
2858    if (ii != VtsID_INVALID)
2859       return ii;
2860    te.vts = NULL;
2861    te.rc = 0;
2862    te.u.freelink = VtsID_INVALID;
2863    ii = (VtsID)VG_(addToXA)( vts_tab, &te );
2864    return ii;
2865 }
2866 
2867 
2868 /* Indirect callback from lib_zsm. */
2869 static void VtsID__rcinc ( VtsID ii )
2870 {
2871    VtsTE* ie;
2872    /* VG_(indexXA) does a range check for us */
2873    ie = VG_(indexXA)( vts_tab, ii );
2874    tl_assert(ie->vts); /* else it's not in use */
2875    tl_assert(ie->rc < ~0UL); /* else we can't continue */
2876    tl_assert(ie->vts->id == ii);
2877    ie->rc++;
2878 }
2879 
2880 /* Indirect callback from lib_zsm. */
2881 static void VtsID__rcdec ( VtsID ii )
2882 {
2883    VtsTE* ie;
2884    /* VG_(indexXA) does a range check for us */
2885    ie = VG_(indexXA)( vts_tab, ii );
2886    tl_assert(ie->vts); /* else it's not in use */
2887    tl_assert(ie->rc > 0); /* else RC snafu */
2888    tl_assert(ie->vts->id == ii);
2889    ie->rc--;
2890 }
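
/* Note: these reference counts are driven from the compressed shadow
   memory machinery and are not kept continuously up to date; they are
   only known to be accurate after zsm_flush_cache() has been called,
   which is why vts_tab__do_GC() below flushes that cache before
   inspecting the .rc fields. */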
2891 
2892 
2893 /* Look up 'cand' in our collection of VTSs.  If present, return the
2894    VtsID for the pre-existing version.  If not present, clone it, add
2895    the clone to both vts_tab and vts_set, allocate a fresh VtsID for
2896    it, and return that. */
2897 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
2898 {
2899    VTS* in_tab = NULL;
2900    tl_assert(cand->id == VtsID_INVALID);
2901    Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
2902    tl_assert(in_tab);
2903    if (already_have) {
2904       /* We already have a copy of 'cand'.  Use that. */
2905       VtsTE* ie;
2906       tl_assert(in_tab->id != VtsID_INVALID);
2907       ie = VG_(indexXA)( vts_tab, in_tab->id );
2908       tl_assert(ie->vts == in_tab);
2909       return in_tab->id;
2910    } else {
2911       VtsID  ii = get_new_VtsID();
2912       VtsTE* ie = VG_(indexXA)( vts_tab, ii );
2913       ie->vts = in_tab;
2914       ie->rc = 0;
2915       ie->u.freelink = VtsID_INVALID;
2916       in_tab->id = ii;
2917       return ii;
2918    }
2919 }
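
/* A typical interning sequence, as used by the VtsID__* operations further
   down (sketch only):

      temp_max_sized_VTS->usedTS = 0;
      VTS__join( temp_max_sized_VTS, vts1, vts2 );
      VtsID res = vts_tab__find__or__clone_and_add( temp_max_sized_VTS );

   The scratch VTS itself is never stored; only a clone of it (or an
   existing structurally identical VTS) ends up in vts_tab/vts_set. */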
2920 
2921 
2922 static void show_vts_stats ( const HChar* caller )
2923 {
2924    UWord nSet, nTab, nLive;
2925    ULong totrc;
2926    UWord n, i;
2927    nSet = VG_(sizeFM)( vts_set );
2928    nTab = VG_(sizeXA)( vts_tab );
2929    totrc = 0;
2930    nLive = 0;
2931    n = VG_(sizeXA)( vts_tab );
2932    for (i = 0; i < n; i++) {
2933       VtsTE* ie = VG_(indexXA)( vts_tab, i );
2934       if (ie->vts) {
2935          nLive++;
2936          totrc += (ULong)ie->rc;
2937       } else {
2938          tl_assert(ie->rc == 0);
2939       }
2940    }
2941    VG_(printf)("  show_vts_stats %s\n", caller);
2942    VG_(printf)("    vts_tab size %4lu\n", nTab);
2943    VG_(printf)("    vts_tab live %4lu\n", nLive);
2944    VG_(printf)("    vts_set size %4lu\n", nSet);
2945    VG_(printf)("        total rc %4llu\n", totrc);
2946 }
2947 
2948 
2949 /* --- Helpers for VtsID pruning --- */
2950 
2951 static
2952 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2953                    /*MOD*/XArray* /* of VtsTE */ new_tab,
2954                    VtsID* ii )
2955 {
2956    VtsTE *old_te, *new_te;
2957    VtsID old_id, new_id;
2958    /* We're relying here on VG_(indexXA)'s range checking to assert on
2959       any stupid values, in particular *ii == VtsID_INVALID. */
2960    old_id = *ii;
2961    old_te = VG_(indexXA)( old_tab, old_id );
2962    old_te->rc--;
2963    new_id = old_te->u.remap;
2964    new_te = VG_(indexXA)( new_tab, new_id );
2965    new_te->rc++;
2966    *ii = new_id;
2967 }
2968 
2969 static
2970 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
2971                             /*MOD*/XArray* /* of VtsTE */ new_tab,
2972                             SVal* s )
2973 {
2974    SVal old_sv, new_sv;
2975    old_sv = *s;
2976    if (SVal__isC(old_sv)) {
2977       VtsID rMin, wMin;
2978       rMin = SVal__unC_Rmin(old_sv);
2979       wMin = SVal__unC_Wmin(old_sv);
2980       remap_VtsID( old_tab, new_tab, &rMin );
2981       remap_VtsID( old_tab, new_tab, &wMin );
2982       new_sv = SVal__mkC( rMin, wMin );
2983       *s = new_sv;
2984   }
2985 }
2986 
2987 
2988 /* NOT TO BE CALLED FROM WITHIN libzsm. */
2989 __attribute__((noinline))
2990 static void vts_tab__do_GC ( Bool show_stats )
2991 {
2992    UWord i, nTab, nLive, nFreed;
2993 
2994    /* ---------- BEGIN VTS GC ---------- */
2995    /* check this is actually necessary. */
2996    tl_assert(vts_tab_freelist == VtsID_INVALID);
2997 
2998    /* empty the caches for partial order checks and binary joins.  We
2999       could do better and prune out the entries to be deleted, but it
3000       ain't worth the hassle. */
3001    VtsID__invalidate_caches();
3002 
3003    /* First, make the reference counts up to date. */
3004    zsm_flush_cache();
3005 
3006    nTab = VG_(sizeXA)( vts_tab );
3007 
3008    if (show_stats) {
3009       VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
3010       show_vts_stats("before GC");
3011    }
3012 
3013    /* Now we can inspect the entire vts_tab.  Any entries with zero
3014       .rc fields are now no longer in use and can be put back on the
3015       free list, removed from vts_set, and deleted. */
3016    nFreed = 0;
3017    for (i = 0; i < nTab; i++) {
3018       Bool present;
3019       UWord oldK = 0, oldV = 12345;
3020       VtsTE* te = VG_(indexXA)( vts_tab, i );
3021       if (te->vts == NULL) {
3022          tl_assert(te->rc == 0);
3023          continue; /* already on the free list (presumably) */
3024       }
3025       if (te->rc > 0)
3026          continue; /* in use */
3027       /* Ok, we got one we can free. */
3028       tl_assert(te->vts->id == i);
3029       /* first, remove it from vts_set. */
3030       present = VG_(delFromFM)( vts_set,
3031                                 &oldK, &oldV, (UWord)te->vts );
3032       tl_assert(present); /* else it isn't in vts_set ?! */
3033       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3034       tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
3035       /* now free the VTS itself */
3036       VTS__delete(te->vts);
3037       te->vts = NULL;
3038       /* and finally put this entry on the free list */
3039       tl_assert(te->u.freelink == VtsID_INVALID); /* can't already be on it */
3040       add_to_free_list( i );
3041       nFreed++;
3042    }
3043 
3044    /* Now figure out when the next GC should be.  We'll allow the
3045       number of VTSs to double before GCing again.  Except of course
3046       that since we can't (or, at least, don't) shrink vts_tab, we
3047       can't set the threshold value smaller than it. */
3048    tl_assert(nFreed <= nTab);
3049    nLive = nTab - nFreed;
3050    tl_assert(nLive >= 0 && nLive <= nTab);
3051    vts_next_GC_at = 2 * nLive;
3052    if (vts_next_GC_at < nTab)
3053       vts_next_GC_at = nTab;
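
   /* For example: if nTab == 1000 and 300 entries survive the sweep, the
      assignment above first gives 600 (2 * nLive) and is then raised to
      1000, because the table never shrinks below its current size. */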
3054 
3055    if (show_stats) {
3056       show_vts_stats("after GC");
3057       VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
3058    }
3059 
3060    stats__vts_tab_GC++;
3061    if (VG_(clo_stats)) {
3062       tl_assert(nTab > 0);
3063       VG_(message)(Vg_DebugMsg,
3064                    "libhb: VTS GC: #%lu  old size %lu  live %lu  (%2llu%%)\n",
3065                    stats__vts_tab_GC,
3066                    nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
3067    }
3068    /* ---------- END VTS GC ---------- */
3069 
3070    /* Decide whether to do VTS pruning.  We have one of three
3071       settings. */
3072    static UInt pruning_auto_ctr = 0; /* do not make non-static */
3073 
3074    Bool do_pruning = False;
3075    switch (HG_(clo_vts_pruning)) {
3076       case 0: /* never */
3077          break;
3078       case 1: /* auto */
3079          do_pruning = (++pruning_auto_ctr % 5) == 0;
3080          break;
3081       case 2: /* always */
3082          do_pruning = True;
3083          break;
3084       default:
3085          tl_assert(0);
3086    }
3087 
3088    /* The rest of this routine only handles pruning, so we can
3089       quit at this point if it is not to be done. */
3090    if (!do_pruning)
3091       return;
3092    /* No need to do pruning if no thread died since the last pruning as
3093       no VtsTE can be pruned. */
3094    if (VG_(sizeXA)( verydead_thread_table_not_pruned) == 0)
3095       return;
3096 
3097    /* ---------- BEGIN VTS PRUNING ---------- */
3098    /* Sort and check the very dead threads that died since the last pruning.
3099       Sorting is used for the check and so that we can quickly look
3100       up the dead-thread entries as we work through the VTSs. */
3101    verydead_thread_table_sort_and_check (verydead_thread_table_not_pruned);
3102 
3103    /* We will run through the old table, and create a new table and
3104       set, at the same time setting the u.remap entries in the old
3105       table to point to the new entries.  Then, visit every VtsID in
3106       the system, and replace all of them with new ones, using the
3107       u.remap entries in the old table.  Finally, we can delete the old
3108       table and set. */
3109 
3110    XArray* /* of VtsTE */ new_tab
3111       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
3112                     HG_(free), sizeof(VtsTE) );
3113 
3114    /* WordFM VTS* void */
3115    WordFM* new_set
3116       = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
3117                     HG_(free),
3118                     (Word(*)(UWord,UWord))VTS__cmp_structural );
3119 
3120    /* Visit each old VTS.  For each one:
3121 
3122       * make a pruned version
3123 
3124       * search new_set for the pruned version, yielding either
3125         Nothing (not present) or the new VtsID for it.
3126 
3127       * if not present, allocate a new VtsID for it, insert (pruned
3128         VTS, new VtsID) in the tree, and set
3129         remap_table[old VtsID] = new VtsID.
3130 
3131       * if present, set remap_table[old VtsID] = new VtsID, where
3132         new VtsID was determined by the tree lookup.  Then free up
3133         the clone.
3134    */
3135 
3136    UWord nBeforePruning = 0, nAfterPruning = 0;
3137    UWord nSTSsBefore = 0, nSTSsAfter = 0;
3138    VtsID new_VtsID_ctr = 0;
3139 
3140    for (i = 0; i < nTab; i++) {
3141 
3142       /* For each old VTS .. */
3143       VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
3144       VTS*   old_vts = old_te->vts;
3145 
3146       /* Skip it if not in use */
3147       if (old_te->rc == 0) {
3148          tl_assert(old_vts == NULL);
3149          continue;
3150       }
3151       tl_assert(old_te->u.remap == VtsID_INVALID);
3152       tl_assert(old_vts != NULL);
3153       tl_assert(old_vts->id == i);
3154       tl_assert(old_vts->ts != NULL);
3155 
3156       /* It is in use. Make a pruned version. */
3157       nBeforePruning++;
3158       nSTSsBefore += old_vts->usedTS;
3159       VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
3160                                    old_vts, verydead_thread_table_not_pruned);
3161       tl_assert(new_vts->sizeTS == new_vts->usedTS);
3162       tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
3163                 == 0x0ddC0ffeeBadF00dULL);
3164 
3165       /* Get rid of the old VTS and the tree entry.  It's a bit more
3166          complex to incrementally delete the VTSs now than to nuke
3167          them all after we're done, but the upside is that we don't
3168          wind up temporarily storing potentially two complete copies
3169          of each VTS and hence spiking memory use. */
3170       UWord oldK = 0, oldV = 12345;
3171       Bool  present = VG_(delFromFM)( vts_set,
3172                                       &oldK, &oldV, (UWord)old_vts );
3173       tl_assert(present); /* else it isn't in vts_set ?! */
3174       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
3175       tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
3176       /* now free the VTS itself */
3177       VTS__delete(old_vts);
3178       old_te->vts = NULL;
3179       old_vts = NULL;
3180 
3181       /* NO MENTIONS of old_vts allowed beyond this point. */
3182 
3183       /* Ok, we have the pruned copy in new_vts.  See if a
3184          structurally identical version is already present in new_set.
3185          If so, delete the one we just made and move on; if not, add
3186          it. */
3187       VTS*  identical_version = NULL;
3188       UWord valW = 12345;
3189       if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
3190                         (UWord)new_vts)) {
3191          // already have it
3192          tl_assert(valW == 0);
3193          tl_assert(identical_version != NULL);
3194          tl_assert(identical_version != new_vts);
3195          VTS__delete(new_vts);
3196          new_vts = identical_version;
3197          tl_assert(new_vts->id != VtsID_INVALID);
3198       } else {
3199          tl_assert(valW == 12345);
3200          tl_assert(identical_version == NULL);
3201          new_vts->id = new_VtsID_ctr++;
3202          Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
3203          tl_assert(!b);
3204          VtsTE new_te;
3205          new_te.vts      = new_vts;
3206          new_te.rc       = 0;
3207          new_te.u.freelink = VtsID_INVALID;
3208          Word j = VG_(addToXA)( new_tab, &new_te );
3209          tl_assert(j <= i);
3210          tl_assert(j == new_VtsID_ctr - 1);
3211          // stats
3212          nAfterPruning++;
3213          nSTSsAfter += new_vts->usedTS;
3214       }
3215       old_te->u.remap = new_vts->id;
3216 
3217    } /* for (i = 0; i < nTab; i++) */
3218 
3219    /* Move very dead threads from verydead_thread_table_not_pruned to
3220       verydead_thread_table. Sort and check verydead_thread_table
3221       to verify a thread was reported very dead only once. */
3222    {
3223       UWord nBT = VG_(sizeXA)( verydead_thread_table_not_pruned);
3224 
3225       for (i = 0; i < nBT; i++) {
3226          ThrID thrid =
3227             *(ThrID*)VG_(indexXA)( verydead_thread_table_not_pruned, i );
3228          VG_(addToXA)( verydead_thread_table, &thrid );
3229       }
3230       verydead_thread_table_sort_and_check (verydead_thread_table);
3231       VG_(dropHeadXA) (verydead_thread_table_not_pruned, nBT);
3232    }
3233 
3234    /* At this point, we have:
3235       * the old VTS table, with its u.remap entries set,
3236         and with all .vts == NULL.
3237       * the old VTS tree should be empty, since it and the old VTSs
3238         it contained have been incrementally deleted as we worked
3239         through the old table.
3240       * the new VTS table, with all .rc == 0, all u.freelink and u.remap
3241         == VtsID_INVALID.
3242       * the new VTS tree.
3243    */
3244    tl_assert( VG_(sizeFM)(vts_set) == 0 );
3245 
3246    /* Now actually apply the mapping. */
3247    /* Visit all the VtsIDs in the entire system.  Where do we expect
3248       to find them?
3249       (a) in shadow memory -- the LineZs and LineFs
3250       (b) in our collection of struct _Thrs.
3251       (c) in our collection of struct _SOs.
3252       Nowhere else, AFAICS.  Not in the zsm cache, because that just
3253       got invalidated.
3254 
3255       Using the u.remap fields in vts_tab, map each old VtsID to a new
3256       VtsID.  For each old VtsID, dec its rc; and for each new one,
3257       inc it.  This sets up the new refcounts, and it also gives a
3258       cheap sanity check of the old ones: all old refcounts should be
3259       zero after this operation.
3260    */
3261 
3262    /* Do the mappings for (a) above: iterate over the Primary shadow
3263       mem map (WordFM Addr SecMap*). */
3264    UWord secmapW = 0;
3265    VG_(initIterFM)( map_shmem );
3266    while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
3267       UWord   j;
3268       SecMap* sm = (SecMap*)secmapW;
3269       tl_assert(sm->magic == SecMap_MAGIC);
3270       /* Deal with the LineZs */
3271       for (i = 0; i < N_SECMAP_ZLINES; i++) {
3272          LineZ* lineZ = &sm->linesZ[i];
3273          if (lineZ->dict[0] != SVal_INVALID) {
3274             for (j = 0; j < 4; j++)
3275                remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
3276          } else {
3277             LineF* lineF = SVal2Ptr (lineZ->dict[1]);
3278             for (j = 0; j < N_LINE_ARANGE; j++)
3279                remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
3280          }
3281       }
3282    }
3283    VG_(doneIterFM)( map_shmem );
3284 
3285    /* Do the mappings for (b) above: visit our collection of struct
3286       _Thrs. */
3287    Thread* hgthread = get_admin_threads();
3288    tl_assert(hgthread);
3289    while (hgthread) {
3290       Thr* hbthr = hgthread->hbthr;
3291       tl_assert(hbthr);
3292       /* Threads that are listed in the prunable set have their viR
3293          and viW set to VtsID_INVALID, so we can't mess with them. */
3294       if (hbthr->llexit_done && hbthr->joinedwith_done) {
3295          tl_assert(hbthr->viR == VtsID_INVALID);
3296          tl_assert(hbthr->viW == VtsID_INVALID);
3297          hgthread = hgthread->admin;
3298          continue;
3299       }
3300       remap_VtsID( vts_tab, new_tab, &hbthr->viR );
3301       remap_VtsID( vts_tab, new_tab, &hbthr->viW );
3302       hgthread = hgthread->admin;
3303    }
3304 
3305    /* Do the mappings for (c) above: visit the struct _SOs. */
3306    SO* so = admin_SO;
3307    while (so) {
3308       if (so->viR != VtsID_INVALID)
3309          remap_VtsID( vts_tab, new_tab, &so->viR );
3310       if (so->viW != VtsID_INVALID)
3311          remap_VtsID( vts_tab, new_tab, &so->viW );
3312       so = so->admin_next;
3313    }
3314 
3315    /* So, we're nearly done (with this incredibly complex operation).
3316       Check the refcounts for the old VtsIDs all fell to zero, as
3317       expected.  Any failure is serious. */
3318    for (i = 0; i < nTab; i++) {
3319       VtsTE* te = VG_(indexXA)( vts_tab, i );
3320       tl_assert(te->vts == NULL);
3321       /* This is the assert proper.  Note we're also asserting
3322          zeroness for old entries which are unmapped.  That's OK. */
3323       tl_assert(te->rc == 0);
3324    }
3325 
3326    /* Install the new table and set. */
3327    VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
3328    vts_set = new_set;
3329    VG_(deleteXA)( vts_tab );
3330    vts_tab = new_tab;
3331 
3332    /* The freelist of vts_tab entries is empty now, because we've
3333       compacted all of the live entries at the low end of the
3334       table. */
3335    vts_tab_freelist = VtsID_INVALID;
3336 
3337    /* Sanity check vts_set and vts_tab. */
3338 
3339    /* Because all the live entries got slid down to the bottom of vts_tab: */
3340    tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
3341 
3342    /* Assert that the vts_tab and vts_set entries point at each other
3343       in the required way */
3344    UWord wordK = 0, wordV = 0;
3345    VG_(initIterFM)( vts_set );
3346    while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
3347       tl_assert(wordK != 0);
3348       tl_assert(wordV == 0);
3349       VTS* vts = (VTS*)wordK;
3350       tl_assert(vts->id != VtsID_INVALID);
3351       VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
3352       tl_assert(te->vts == vts);
3353    }
3354    VG_(doneIterFM)( vts_set );
3355 
3356    /* Also iterate over the table, and check each entry is
3357       plausible. */
3358    nTab = VG_(sizeXA)( vts_tab );
3359    for (i = 0; i < nTab; i++) {
3360       VtsTE* te = VG_(indexXA)( vts_tab, i );
3361       tl_assert(te->vts);
3362       tl_assert(te->vts->id == i);
3363       tl_assert(te->rc > 0); /* 'cos we just GC'd */
3364       tl_assert(te->u.freelink == VtsID_INVALID); /* in use */
3365       /* value of te->u.remap  not relevant */
3366    }
3367 
3368    /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
3369    stats__vts_pruning++;
3370    if (VG_(clo_stats)) {
3371       tl_assert(nTab > 0);
3372       VG_(message)(
3373          Vg_DebugMsg,
3374          "libhb: VTS PR: #%lu  before %lu (avg sz %lu)  "
3375             "after %lu (avg sz %lu)\n",
3376          stats__vts_pruning,
3377          nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
3378          nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
3379       );
3380    }
3381    /* ---------- END VTS PRUNING ---------- */
3382 }
3383 
3384 
3385 /////////////////////////////////////////////////////////
3386 //                                                     //
3387 // Vts IDs                                             //
3388 //                                                     //
3389 /////////////////////////////////////////////////////////
3390 
3391 //////////////////////////
3392 /* A max-sized VTS which is used as scratch space (the first
3393    argument) in VTS__singleton, VTS__tick and VTS__join operations. */
3394 static VTS* temp_max_sized_VTS = NULL;
3395 
3396 //////////////////////////
3397 static ULong stats__cmpLEQ_queries = 0;
3398 static ULong stats__cmpLEQ_misses  = 0;
3399 static ULong stats__join2_queries  = 0;
3400 static ULong stats__join2_misses   = 0;
3401 
3402 static inline UInt ROL32 ( UInt w, Int n ) {
3403    w = (w << n) | (w >> (32-n));
3404    return w;
3405 }
3406 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
3407    UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
3408    return hash % nTab;
3409 }
3410 
3411 #define N_CMPLEQ_CACHE 1023
3412 static
3413    struct { VtsID vi1; VtsID vi2; Bool leq; }
3414    cmpLEQ_cache[N_CMPLEQ_CACHE];
3415 
3416 #define N_JOIN2_CACHE 1023
3417 static
3418    struct { VtsID vi1; VtsID vi2; VtsID res; }
3419    join2_cache[N_JOIN2_CACHE];
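
/* Both caches are direct-mapped: a colliding (vi1,vi2) pair simply
   overwrites whatever was in its slot, and a miss just recomputes the
   answer, so correctness never depends on what the caches contain. */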
3420 
3421 static void VtsID__invalidate_caches ( void ) {
3422    Int i;
3423    for (i = 0; i < N_CMPLEQ_CACHE; i++) {
3424       cmpLEQ_cache[i].vi1 = VtsID_INVALID;
3425       cmpLEQ_cache[i].vi2 = VtsID_INVALID;
3426       cmpLEQ_cache[i].leq = False;
3427    }
3428    for (i = 0; i < N_JOIN2_CACHE; i++) {
3429      join2_cache[i].vi1 = VtsID_INVALID;
3430      join2_cache[i].vi2 = VtsID_INVALID;
3431      join2_cache[i].res = VtsID_INVALID;
3432    }
3433 }
3434 //////////////////////////
3435 
3436 //static Bool VtsID__is_valid ( VtsID vi ) {
3437 //   VtsTE* ve;
3438 //   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
3439 //      return False;
3440 //   ve = VG_(indexXA)( vts_tab, vi );
3441 //   if (!ve->vts)
3442 //      return False;
3443 //   tl_assert(ve->vts->id == vi);
3444 //   return True;
3445 //}
3446 
3447 static VTS* VtsID__to_VTS ( VtsID vi ) {
3448    VtsTE* te = VG_(indexXA)( vts_tab, vi );
3449    tl_assert(te->vts);
3450    return te->vts;
3451 }
3452 
3453 static void VtsID__pp ( VtsID vi ) {
3454    VTS* vts = VtsID__to_VTS(vi);
3455    VTS__show( vts );
3456 }
3457 
3458 /* compute partial ordering relation of vi1 and vi2. */
3459 __attribute__((noinline))
3460 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
3461    UInt hash;
3462    Bool leq;
3463    VTS  *v1, *v2;
3464    //if (vi1 == vi2) return True;
3465    tl_assert(vi1 != vi2);
3466    ////++
3467    stats__cmpLEQ_queries++;
3468    hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
3469    if (cmpLEQ_cache[hash].vi1 == vi1
3470        && cmpLEQ_cache[hash].vi2 == vi2)
3471       return cmpLEQ_cache[hash].leq;
3472    stats__cmpLEQ_misses++;
3473    ////--
3474    v1  = VtsID__to_VTS(vi1);
3475    v2  = VtsID__to_VTS(vi2);
3476    leq = VTS__cmpLEQ( v1, v2 ) == 0;
3477    ////++
3478    cmpLEQ_cache[hash].vi1 = vi1;
3479    cmpLEQ_cache[hash].vi2 = vi2;
3480    cmpLEQ_cache[hash].leq = leq;
3481    ////--
3482    return leq;
3483 }
3484 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
3485    return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
3486 }
3487 
3488 /* compute binary join */
3489 __attribute__((noinline))
3490 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
3491    UInt  hash;
3492    VtsID res;
3493    VTS   *vts1, *vts2;
3494    //if (vi1 == vi2) return vi1;
3495    tl_assert(vi1 != vi2);
3496    ////++
3497    stats__join2_queries++;
3498    hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
3499    if (join2_cache[hash].vi1 == vi1
3500        && join2_cache[hash].vi2 == vi2)
3501       return join2_cache[hash].res;
3502    stats__join2_misses++;
3503    ////--
3504    vts1 = VtsID__to_VTS(vi1);
3505    vts2 = VtsID__to_VTS(vi2);
3506    temp_max_sized_VTS->usedTS = 0;
3507    VTS__join(temp_max_sized_VTS, vts1,vts2);
3508    res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3509    ////++
3510    join2_cache[hash].vi1 = vi1;
3511    join2_cache[hash].vi2 = vi2;
3512    join2_cache[hash].res = res;
3513    ////--
3514    return res;
3515 }
3516 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
3517    return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
3518 }
3519 
3520 /* create a singleton VTS, namely [thr:1] */
3521 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
3522    temp_max_sized_VTS->usedTS = 0;
3523    VTS__singleton(temp_max_sized_VTS, thr,tym);
3524    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3525 }
3526 
3527 /* tick operation, creates value 1 if specified index is absent */
3528 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
3529    VTS* vts = VtsID__to_VTS(vi);
3530    temp_max_sized_VTS->usedTS = 0;
3531    VTS__tick(temp_max_sized_VTS, idx,vts);
3532    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
3533 }
3534 
3535 /* index into a VTS (only for assertions) */
3536 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
3537    VTS* vts = VtsID__to_VTS(vi);
3538    return VTS__indexAt_SLOW( vts, idx );
3539 }
3540 
3541 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
3542    any, really) element in vi1 which is pointwise greater-than the
3543    corresponding element in vi2.  If no such element exists, return
3544    NULL.  This needs to be fairly quick since it is called every time
3545    a race is detected. */
3546 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
3547 {
3548    VTS  *vts1, *vts2;
3549    Thr*  diffthr;
3550    ThrID diffthrid;
3551    tl_assert(vi1 != vi2);
3552    vts1 = VtsID__to_VTS(vi1);
3553    vts2 = VtsID__to_VTS(vi2);
3554    tl_assert(vts1 != vts2);
3555    diffthrid = VTS__cmpLEQ(vts1, vts2);
3556    diffthr = Thr__from_ThrID(diffthrid);
3557    tl_assert(diffthr); /* else they are LEQ ! */
3558    return diffthr;
3559 }
3560 
3561 
3562 /////////////////////////////////////////////////////////
3563 //                                                     //
3564 // Filters                                             //
3565 //                                                     //
3566 /////////////////////////////////////////////////////////
3567 
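/* A note on the Filter representation, as far as it can be inferred from
   the handlers below: each filter line covers FI_LINE_SZB (== 32) bytes
   and holds 4 UShorts, one per aligned 8-byte group, giving 2 bits of
   state per byte.  Within each 2-bit pair the upper bit is the "read
   seen" bit and the lower bit the "write seen" bit; the cwr (write)
   handlers test and set both bits of a pair, while the crd (read)
   handlers only deal with the upper bit. */
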
3568 /* Forget everything we know -- clear the filter and let everything
3569    through.  This needs to be as fast as possible, since it is called
3570    every time the running thread changes, and every time a thread's
3571    vector clocks change, which can be quite frequent.  The obvious
3572    fast way to do this is simply to stuff in tags which we know are
3573    not going to match anything, since they're not aligned to the start
3574    of a line. */
3575 static void Filter__clear ( Filter* fi, const HChar* who )
3576 {
3577    UWord i;
3578    if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
3579    for (i = 0; i < FI_NUM_LINES; i += 8) {
3580       fi->tags[i+0] = 1; /* impossible value -- cannot match */
3581       fi->tags[i+1] = 1;
3582       fi->tags[i+2] = 1;
3583       fi->tags[i+3] = 1;
3584       fi->tags[i+4] = 1;
3585       fi->tags[i+5] = 1;
3586       fi->tags[i+6] = 1;
3587       fi->tags[i+7] = 1;
3588    }
3589    tl_assert(i == FI_NUM_LINES);
3590 }
3591 
3592 /* Clearing an arbitrary range in the filter.  Unfortunately
3593    we have to do this due to core-supplied new/die-mem events. */
3594 
3595 static void Filter__clear_1byte ( Filter* fi, Addr a )
3596 {
3597    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3598    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3599    FiLine* line   = &fi->lines[lineno];
3600    UWord   loff   = (a - atag) / 8;
3601    UShort  mask   = 0x3 << (2 * (a & 7));
3602    /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
3603    if (LIKELY( fi->tags[lineno] == atag )) {
3604       /* hit.  clear the bits. */
3605       UShort  u16  = line->u16s[loff];
3606       line->u16s[loff] = u16 & ~mask; /* clear them */
3607    } else {
3608       /* miss.  The filter doesn't hold this address, so ignore. */
3609    }
3610 }
3611 
3612 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
3613 {
3614    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3615    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3616    FiLine* line   = &fi->lines[lineno];
3617    UWord   loff   = (a - atag) / 8;
3618    if (LIKELY( fi->tags[lineno] == atag )) {
3619       line->u16s[loff] = 0;
3620    } else {
3621     /* miss.  The filter doesn't hold this address, so ignore. */
3622    }
3623 }
3624 
3625 /* Only used to verify the fast Filter__clear_range */
3626 __attribute__((unused))
3627 static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
3628 {
3629    tl_assert (CHECK_ZSM);
3630 
3631    /* slowly do part preceding 8-alignment */
3632    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
3633       Filter__clear_1byte( fi, a );
3634       a++;
3635       len--;
3636    }
3637    /* vector loop */
3638    while (len >= 8) {
3639       Filter__clear_8bytes_aligned( fi, a );
3640       a += 8;
3641       len -= 8;
3642    }
3643    /* slowly do tail */
3644    while (UNLIKELY(len > 0)) {
3645       Filter__clear_1byte( fi, a );
3646       a++;
3647       len--;
3648    }
3649 }
3650 
3651 static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
3652 {
3653 #  if CHECK_ZSM > 0
3654    /* We check the more complex algorithm below against the simple one.
3655       This check is very expensive: we first do it the slow way on a
3656       copy of the data, then do it the fast way.  On RETURN, we check
3657       that the two results are equal. */
3658    Filter fi_check = *fi;
3659    Filter__clear_range_SLOW(&fi_check, a, len);
3660 #  define RETURN goto check_and_return
3661 #  else
3662 #  define RETURN return
3663 #  endif
3664 
3665    Addr    begtag = FI_GET_TAG(a);       /* tag of range begin */
3666 
3667    Addr    end = a + len - 1;
3668    Addr    endtag = FI_GET_TAG(end); /* tag of range end. */
3669 
3670    UWord rlen = len; /* remaining length to clear */
3671 
3672    Addr    c = a; /* Current position we are clearing. */
3673    UWord   clineno = FI_GET_LINENO(c); /* Current lineno we are clearing */
3674    FiLine* cline; /* Current line we are clearing */
3675    UWord   cloff; /* Current offset in line we are clearing, when clearing
3676                      partial lines. */
3677 
3678    UShort u16;
3679 
3680    STATIC_ASSERT (FI_LINE_SZB == 32);
3681    // Below assumes filter lines are 32 bytes
3682 
3683    if (LIKELY(fi->tags[clineno] == begtag)) {
3684       /* LIKELY for the heavy caller VG_(unknown_SP_update). */
3685       /* First filter line matches begtag.
3686          If c is not at the filter line begin, the below will clear
3687          the filter line bytes starting from c. */
3688       cline = &fi->lines[clineno];
3689       cloff = (c - begtag) / 8;
3690 
3691       /* First the byte(s) needed to reach 8-alignment */
3692       if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
3693          /* hiB is the nr of bytes (higher addresses) from c to reach
3694             8-alignment. */
3695          UWord hiB = 8 - (c & 7);
3696          /* Compute 2-bit/byte mask representing hiB bytes [c..c+hiB[
3697             mask is  C000 , F000, FC00, FF00, FFC0, FFF0 or FFFC for the byte
3698             range    7..7   6..7  5..7  4..7  3..7  2..7    1..7 */
3699          UShort mask = 0xFFFF << (16 - 2*hiB);
3700 
3701          u16  = cline->u16s[cloff];
3702          if (LIKELY(rlen >= hiB)) {
3703             cline->u16s[cloff] = u16 & ~mask; /* clear all hiB from c */
3704             rlen -= hiB;
3705             c += hiB;
3706             cloff += 1;
3707          } else {
3708             /* Only have the bits for rlen bytes. */
3709             mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
3710             cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c. */
3711             RETURN;  // We have cleared all that we can.
3712          }
3713       }
3714       /* c is now 8 aligned. Clear by 8 aligned bytes,
3715          till c is filter-line aligned */
3716       while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
3717          cline->u16s[cloff] = 0;
3718          c += 8;
3719          rlen -= 8;
3720          cloff += 1;
3721       }
3722    } else {
3723       c = begtag + FI_LINE_SZB;
3724       if (c > end)
3725          RETURN;   // We have cleared all that we can.
3726       rlen -= c - a;
3727    }
3728    // We have changed c, so re-establish clineno.
3729    clineno = FI_GET_LINENO(c);
3730 
3731    if (rlen >= FI_LINE_SZB) {
3732       /* Here, c is filter line-aligned.  Clear all the full filter
3733          lines covered by the range starting at c. */
3734       UWord nfull = rlen / FI_LINE_SZB;
3735       UWord full_len = nfull * FI_LINE_SZB;
3736       rlen -= full_len;
3737       if (nfull > FI_NUM_LINES)
3738          nfull = FI_NUM_LINES; // no need to check the same entry several times.
3739 
3740       for (UWord n = 0; n < nfull; n++) {
3741          if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
3742             cline = &fi->lines[clineno];
3743             cline->u16s[0] = 0;
3744             cline->u16s[1] = 0;
3745             cline->u16s[2] = 0;
3746             cline->u16s[3] = 0;
3747             STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
3748          }
3749          clineno++;
3750          if (UNLIKELY(clineno == FI_NUM_LINES))
3751             clineno = 0;
3752       }
3753 
3754       c += full_len;
3755       clineno = FI_GET_LINENO(c);
3756    }
3757 
3758    if (CHECK_ZSM) {
3759       tl_assert(VG_IS_8_ALIGNED(c));
3760       tl_assert(clineno == FI_GET_LINENO(c));
3761    }
3762 
3763    /* Do the last filter line, if it was not cleared as a full filter line */
3764    if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
3765       cline = &fi->lines[clineno];
3766       cloff = (c - endtag) / 8;
3767       if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);
3768 
3769       /* c is 8 aligned. Clear by 8 aligned bytes, till we have less than
3770          8 bytes. */
3771       while (rlen >= 8) {
3772          cline->u16s[cloff] = 0;
3773          c += 8;
3774          rlen -= 8;
3775          cloff += 1;
3776       }
3777       /* Then the remaining byte(s) */
3778       if (rlen > 0) {
3779          /* nr of bytes from c to reach end. */
3780          UWord loB = rlen;
3781          /* Compute mask representing loB bytes [c..c+loB[ :
3782             mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF */
3783          UShort mask = 0xFFFF >> (16 - 2*loB);
3784 
3785          u16  = cline->u16s[cloff];
3786          cline->u16s[cloff] = u16 & ~mask; /* clear all loB from c */
3787       }
3788    }
3789 
3790 #  if CHECK_ZSM > 0
3791    check_and_return:
3792    tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
3793 #  endif
3794 #  undef RETURN
3795 }
3796 
3797 /* ------ Read handlers for the filter. ------ */
3798 
3799 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
3800 {
3801    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3802       return False;
3803    {
3804      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3805      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3806      FiLine* line   = &fi->lines[lineno];
3807      UWord   loff   = (a - atag) / 8;
3808      UShort  mask   = 0xAAAA;
3809      if (LIKELY( fi->tags[lineno] == atag )) {
3810         /* hit.  check line and update. */
3811         UShort u16  = line->u16s[loff];
3812         Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
3813         line->u16s[loff] = u16 | mask; /* set them */
3814         return ok;
3815      } else {
3816         /* miss.  nuke existing line and re-use it. */
3817         UWord i;
3818         fi->tags[lineno] = atag;
3819         for (i = 0; i < FI_LINE_SZB / 8; i++)
3820            line->u16s[i] = 0;
3821         line->u16s[loff] = mask;
3822         return False;
3823      }
3824    }
3825 }
3826 
3827 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
3828 {
3829    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3830       return False;
3831    {
3832      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3833      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3834      FiLine* line   = &fi->lines[lineno];
3835      UWord   loff   = (a - atag) / 8;
3836      UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
3837      if (LIKELY( fi->tags[lineno] == atag )) {
3838         /* hit.  check line and update. */
3839         UShort  u16  = line->u16s[loff];
3840         Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
3841         line->u16s[loff] = u16 | mask; /* set them */
3842         return ok;
3843      } else {
3844         /* miss.  nuke existing line and re-use it. */
3845         UWord   i;
3846         fi->tags[lineno] = atag;
3847         for (i = 0; i < FI_LINE_SZB / 8; i++)
3848            line->u16s[i] = 0;
3849         line->u16s[loff] = mask;
3850         return False;
3851      }
3852    }
3853 }
3854 
3855 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
3856 {
3857    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3858       return False;
3859    {
3860      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3861      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3862      FiLine* line   = &fi->lines[lineno];
3863      UWord   loff   = (a - atag) / 8;
3864      UShort  mask   = 0xA << (2 * (a & 6));
3865      /* mask is A000, 0A00, 00A0 or 000A */
3866      if (LIKELY( fi->tags[lineno] == atag )) {
3867         /* hit.  check line and update. */
3868         UShort  u16  = line->u16s[loff];
3869         Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
3870         line->u16s[loff] = u16 | mask; /* set them */
3871         return ok;
3872      } else {
3873         /* miss.  nuke existing line and re-use it. */
3874         UWord   i;
3875         fi->tags[lineno] = atag;
3876         for (i = 0; i < FI_LINE_SZB / 8; i++)
3877            line->u16s[i] = 0;
3878         line->u16s[loff] = mask;
3879         return False;
3880      }
3881    }
3882 }
3883 
3884 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
3885 {
3886    {
3887      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3888      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3889      FiLine* line   = &fi->lines[lineno];
3890      UWord   loff   = (a - atag) / 8;
3891      UShort  mask   = 0x2 << (2 * (a & 7));
3892      /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
3893      if (LIKELY( fi->tags[lineno] == atag )) {
3894         /* hit.  check line and update. */
3895         UShort  u16  = line->u16s[loff];
3896         Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
3897         line->u16s[loff] = u16 | mask; /* set them */
3898         return ok;
3899      } else {
3900         /* miss.  nuke existing line and re-use it. */
3901         UWord   i;
3902         fi->tags[lineno] = atag;
3903         for (i = 0; i < FI_LINE_SZB / 8; i++)
3904            line->u16s[i] = 0;
3905         line->u16s[loff] = mask;
3906         return False;
3907      }
3908    }
3909 }
3910 
3911 
3912 /* ------ Write handlers for the filter. ------ */
3913 
3914 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
3915 {
3916    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
3917       return False;
3918    {
3919      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3920      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3921      FiLine* line   = &fi->lines[lineno];
3922      UWord   loff   = (a - atag) / 8;
3923      UShort  mask   = 0xFFFF;
3924      if (LIKELY( fi->tags[lineno] == atag )) {
3925         /* hit.  check line and update. */
3926         UShort u16  = line->u16s[loff];
3927         Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
3928         line->u16s[loff] = u16 | mask; /* set them */
3929         return ok;
3930      } else {
3931         /* miss.  nuke existing line and re-use it. */
3932         UWord i;
3933         fi->tags[lineno] = atag;
3934         for (i = 0; i < FI_LINE_SZB / 8; i++)
3935            line->u16s[i] = 0;
3936         line->u16s[loff] = mask;
3937         return False;
3938      }
3939    }
3940 }
3941 
3942 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
3943 {
3944    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
3945       return False;
3946    {
3947      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3948      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3949      FiLine* line   = &fi->lines[lineno];
3950      UWord   loff   = (a - atag) / 8;
3951      UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
3952      if (LIKELY( fi->tags[lineno] == atag )) {
3953         /* hit.  check line and update. */
3954         UShort  u16  = line->u16s[loff];
3955         Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
3956         line->u16s[loff] = u16 | mask; /* set them */
3957         return ok;
3958      } else {
3959         /* miss.  nuke existing line and re-use it. */
3960         UWord   i;
3961         fi->tags[lineno] = atag;
3962         for (i = 0; i < FI_LINE_SZB / 8; i++)
3963            line->u16s[i] = 0;
3964         line->u16s[loff] = mask;
3965         return False;
3966      }
3967    }
3968 }
3969 
3970 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
3971 {
3972    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
3973       return False;
3974    {
3975      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
3976      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
3977      FiLine* line   = &fi->lines[lineno];
3978      UWord   loff   = (a - atag) / 8;
3979      UShort  mask   = 0xF << (2 * (a & 6));
3980      /* mask is F000, 0F00, 00F0 or 000F */
3981      if (LIKELY( fi->tags[lineno] == atag )) {
3982         /* hit.  check line and update. */
3983         UShort  u16  = line->u16s[loff];
3984         Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
3985         line->u16s[loff] = u16 | mask; /* set them */
3986         return ok;
3987      } else {
3988         /* miss.  nuke existing line and re-use it. */
3989         UWord   i;
3990         fi->tags[lineno] = atag;
3991         for (i = 0; i < FI_LINE_SZB / 8; i++)
3992            line->u16s[i] = 0;
3993         line->u16s[loff] = mask;
3994         return False;
3995      }
3996    }
3997 }
3998 
3999 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
4000 {
4001    {
4002      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
4003      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
4004      FiLine* line   = &fi->lines[lineno];
4005      UWord   loff   = (a - atag) / 8;
4006      UShort  mask   = 0x3 << (2 * (a & 7));
4007      /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
4008      if (LIKELY( fi->tags[lineno] == atag )) {
4009         /* hit.  check line and update. */
4010         UShort  u16  = line->u16s[loff];
4011         Bool    ok   = (u16 & mask) == mask; /* 1 x R & W bits set? */
4012         line->u16s[loff] = u16 | mask; /* set them */
4013         return ok;
4014      } else {
4015         /* miss.  nuke existing line and re-use it. */
4016         UWord   i;
4017         fi->tags[lineno] = atag;
4018         for (i = 0; i < FI_LINE_SZB / 8; i++)
4019            line->u16s[i] = 0;
4020         line->u16s[loff] = mask;
4021         return False;
4022      }
4023    }
4024 }
4025 
4026 
4027 /////////////////////////////////////////////////////////
4028 //                                                     //
4029 // Threads                                             //
4030 //                                                     //
4031 /////////////////////////////////////////////////////////
4032 
4033 /* Maps ThrID values to their Thr*s (which contain ThrID values that
4034    should point back to the relevant slot in the array).  Lowest
4035    numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
4036 static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
4037 
4038 /* And a counter to dole out ThrID values.  For rationale/background,
4039    see comments on definition of ScalarTS (far) above. */
4040 static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
4041 
4042 static ThrID Thr__to_ThrID ( Thr* thr ) {
4043    return thr->thrid;
4044 }
4045 static Thr* Thr__from_ThrID ( UInt thrid ) {
4046    Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
4047    tl_assert(thr->thrid == thrid);
4048    return thr;
4049 }
4050 
4051 /* True if the cached rcec for thr is valid and can be used to build the
4052    current stack trace just by changing the last frame to the current IP. */
4053 static inline Bool cached_rcec_valid(Thr *thr)
4054 {
4055    UWord cached_stackvalid = VG_(get_SP_s1) (thr->hgthread->coretid);
4056    return cached_stackvalid != 0;
4057 }
4058 /* Set the validity of the cached rcec of thr. */
4059 static inline void set_cached_rcec_validity(Thr *thr, Bool valid)
4060 {
4061    VG_(set_SP_s1) (thr->hgthread->coretid, valid);
4062 }
4063 
4064 static Thr* Thr__new ( void )
4065 {
4066    Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
4067    thr->viR = VtsID_INVALID;
4068    thr->viW = VtsID_INVALID;
4069    thr->llexit_done = False;
4070    thr->joinedwith_done = False;
4071    thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
4072    if (HG_(clo_history_level) == 1)
4073       thr->local_Kws_n_stacks
4074          = VG_(newXA)( HG_(zalloc),
4075                        "libhb.Thr__new.3 (local_Kws_and_stacks)",
4076                        HG_(free), sizeof(ULong_n_EC) );
4077    /* Make an 'empty' cached rcec in thr. */
4078    thr->cached_rcec.magic = RCEC_MAGIC;
4079    thr->cached_rcec.rc = 0;
4080    thr->cached_rcec.rcX = 0;
4081    thr->cached_rcec.next = NULL;
4082 
4083    /* Add this Thr* <-> ThrID binding to the mapping, and
4084       cross-check */
4085    if (!thrid_to_thr_map) {
4086       thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
4087                                      HG_(free), sizeof(Thr*) );
4088    }
4089 
4090    if (thrid_counter >= ThrID_MAX_VALID) {
4091       /* We're hosed.  We have to stop. */
4092       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
4093    }
4094 
4095    thr->thrid = thrid_counter++;
4096    Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
4097    tl_assert(ix + 1024 == thr->thrid);
4098 
4099    return thr;
4100 }
4101 
4102 static void note_local_Kw_n_stack_for ( Thr* thr )
4103 {
4104    Word       nPresent;
4105    ULong_n_EC pair;
4106    tl_assert(thr);
4107 
4108    // We only collect this info at history level 1 (approx)
4109    if (HG_(clo_history_level) != 1)
4110       return;
4111 
4112    /* This is the scalar Kw for thr. */
4113    pair.ull = VtsID__indexAt( thr->viW, thr );
4114    pair.ec  = main_get_EC( thr );
4115    tl_assert(pair.ec);
4116    tl_assert(thr->local_Kws_n_stacks);
4117 
4118    /* check that we're not adding duplicates */
4119    nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4120 
4121    /* Throw away old stacks, if necessary.  We can't accumulate stuff
4122       indefinitely. */
4123    if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
4124       VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
4125       nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
4126       if (0)
4127          VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
4128                      thr, pair.ull, pair.ec );
4129    }
4130 
4131    if (nPresent > 0) {
4132       ULong_n_EC* prevPair
4133          = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
4134       tl_assert( prevPair->ull <= pair.ull );
4135    }
4136 
4137    if (nPresent == 0)
4138       pair.ec = NULL;
4139 
4140    VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
4141 
4142    if (0)
4143       VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
4144                   thr, pair.ull, pair.ec );
4145    if (0)
4146       VG_(pp_ExeContext)(pair.ec);
4147 }
4148 
4149 static Int cmp__ULong_n_EC__by_ULong ( const ULong_n_EC* pair1,
4150                                        const ULong_n_EC* pair2 )
4151 {
4152    if (pair1->ull < pair2->ull) return -1;
4153    if (pair1->ull > pair2->ull) return 1;
4154    return 0;
4155 }
4156 
4157 
4158 /////////////////////////////////////////////////////////
4159 //                                                     //
4160 // Shadow Values                                       //
4161 //                                                     //
4162 /////////////////////////////////////////////////////////
4163 
4164 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
4165 // hb_zsm.h.  We have to do everything else here.
4166 
4167 /* SVal is 64 bit unsigned int.
4168 
4169       <---------30--------->    <---------30--------->
4170    00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
4171    10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
4172    11 0--------------------0 00 0--------------------0   A: SVal_INVALID
4173 
4174 */
4175 #define SVAL_TAGMASK (3ULL << 62)
4176 
4177 static inline Bool SVal__isC ( SVal s ) {
4178    return (0ULL << 62) == (s & SVAL_TAGMASK);
4179 }
4180 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
4181    //tl_assert(VtsID__is_valid(rmini));
4182    //tl_assert(VtsID__is_valid(wmini));
4183    return (((ULong)rmini) << 32) | ((ULong)wmini);
4184 }
4185 static inline VtsID SVal__unC_Rmin ( SVal s ) {
4186    tl_assert(SVal__isC(s));
4187    return (VtsID)(s >> 32);
4188 }
4189 static inline VtsID SVal__unC_Wmin ( SVal s ) {
4190    tl_assert(SVal__isC(s));
4191    return (VtsID)(s & 0xFFFFFFFFULL);
4192 }
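
/* Worked example: for rmini == 0x12345 and wmini == 0x678 (both well
   inside the 30-bit VtsID range), SVal__mkC gives 0x0001234500000678ULL.
   Bits 63..62 are 00, so SVal__isC holds, and SVal__unC_Rmin /
   SVal__unC_Wmin recover 0x12345 and 0x678 respectively. */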
4193 
4194 static inline Bool SVal__isA ( SVal s ) {
4195    return (2ULL << 62) == (s & SVAL_TAGMASK);
4196 }
4197 __attribute__((unused))
4198 static inline SVal SVal__mkA ( void ) {
4199    return 2ULL << 62;
4200 }
4201 
4202 /* Direct callback from lib_zsm. */
4203 static inline void SVal__rcinc ( SVal s ) {
4204    if (SVal__isC(s)) {
4205       VtsID__rcinc( SVal__unC_Rmin(s) );
4206       VtsID__rcinc( SVal__unC_Wmin(s) );
4207    }
4208 }
4209 
4210 /* Direct callback from lib_zsm. */
4211 static inline void SVal__rcdec ( SVal s ) {
4212    if (SVal__isC(s)) {
4213       VtsID__rcdec( SVal__unC_Rmin(s) );
4214       VtsID__rcdec( SVal__unC_Wmin(s) );
4215    }
4216 }
4217 
4218 static inline void *SVal2Ptr (SVal s)
4219 {
4220    return (void*)(UWord)s;
4221 }
4222 
4223 static inline SVal Ptr2SVal (void* ptr)
4224 {
4225    return (SVal)(UWord)ptr;
4226 }
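
/* These two are used to stash ordinary pointers in SVal-typed slots; for
   instance, vts_tab__do_GC above retrieves a LineF* from lineZ->dict[1]
   with SVal2Ptr when remapping VtsIDs in full shadow lines. */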
4227 
4228 
4229 
4230 /////////////////////////////////////////////////////////
4231 //                                                     //
4232 // Change-event map2                                   //
4233 //                                                     //
4234 /////////////////////////////////////////////////////////
4235 
4236 /* This is in two parts:
4237 
4238    1. A hash table of RCECs.  This is a set of reference-counted stack
4239       traces.  When the reference count of a stack trace becomes zero,
4240       it is removed from the set and freed up.  The intent is to have
4241       a set of stack traces which can be referred to from (2), but to
4242       only represent each one once.  The set is indexed/searched by
4243       ordering on the stack trace vectors.
4244 
4245    2. A Hash table of OldRefs.  These store information about each old
4246       ref that we need to record.  Hash table key is the address of the
4247       location for which the information is recorded.  For LRU
4248       purposes, each OldRef in the hash table is also on a doubly
4249       linked list maintaining the order in which the OldRef were most
4250       recently accessed.
4251       Each OldRef also maintains the stamp at which it was last accessed.
4252       With these stamps, we can quickly check which of 2 OldRef is the
4253       'newest', without having to scan the full list of LRU OldRef.
4254 
4255       The important part of an OldRef is, however, its acc component.
4256       This binds a TSW triple (thread, size, R/W) to an RCEC.
4257 
4258       We allocate a maximum of VG_(clo_conflict_cache_size) OldRef.
4259       Then we do exact LRU discarding.  For each discarded OldRef we must
4260       of course decrement the reference count on the RCEC it
4261       refers to, in order that entries from (1) eventually get
4262       discarded too.
4263 */
4264 
4265 static UWord stats__evm__lookup_found = 0;
4266 static UWord stats__evm__lookup_notfound = 0;
4267 
4268 static UWord stats__ctxt_eq_tsw_eq_rcec = 0;
4269 static UWord stats__ctxt_eq_tsw_neq_rcec = 0;
4270 static UWord stats__ctxt_neq_tsw_neq_rcec = 0;
4271 static UWord stats__ctxt_rcdec_calls = 0;
4272 static UWord stats__ctxt_rcec_gc_discards = 0;
4273 
4274 static UWord stats__ctxt_tab_curr = 0;
4275 static UWord stats__ctxt_tab_max  = 0;
4276 
4277 static UWord stats__ctxt_tab_qs   = 0;
4278 static UWord stats__ctxt_tab_cmps = 0;
4279 
4280 
4281 ///////////////////////////////////////////////////////
4282 //// Part (1): A hash table of RCECs
4283 ///
4284 
4285 //#define N_RCEC_TAB 98317 /* prime */
4286 #define N_RCEC_TAB 196613 /* prime */
4287 
4288 //////////// BEGIN RCEC pool allocator
4289 static PoolAlloc* rcec_pool_allocator;
4290 static RCEC* alloc_RCEC ( void ) {
4291    return VG_(allocEltPA) ( rcec_pool_allocator );
4292 }
4293 
4294 static void free_RCEC ( RCEC* rcec ) {
4295    tl_assert(rcec->magic == RCEC_MAGIC);
4296    VG_(freeEltPA)( rcec_pool_allocator, rcec );
4297 }
4298 //////////// END RCEC pool allocator
4299 
4300 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
4301 
4302 /* Count of allocated RCEC having ref count > 0 */
4303 static UWord RCEC_referenced = 0;
4304 
4305 /* True if the frames of ec1 and ec2 are different. */
4306 static Bool RCEC__differs_by_frames ( RCEC* ec1, RCEC* ec2 ) {
4307    Word i;
4308    if (CHECK_CEM) {
4309       tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
4310       tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
4311    }
4312    if (ec1->frames_hash != ec2->frames_hash) return True;
4313    for (i = 0; i < N_FRAMES; i++) {
4314       if (ec1->frames[i] != ec2->frames[i]) return True;
4315    }
4316    return False;
4317 }
4318 
4319 /* Dec the ref of this RCEC. */
4320 static void ctxt__rcdec ( RCEC* ec )
4321 {
4322    stats__ctxt_rcdec_calls++;
4323    if (CHECK_CEM)
4324       tl_assert(ec && ec->magic == RCEC_MAGIC);
4325    tl_assert(ec->rc > 0);
4326    ec->rc--;
4327    if (ec->rc == 0)
4328       RCEC_referenced--;
4329 }
4330 
4331 static void ctxt__rcinc ( RCEC* ec )
4332 {
4333    if (CHECK_CEM)
4334       tl_assert(ec && ec->magic == RCEC_MAGIC);
4335    if (ec->rc == 0)
4336       RCEC_referenced++;
4337    ec->rc++;
4338 }
4339 
4340 
4341 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
4342    move it one step closer to the front of the list, so as to make
4343    subsequent searches for it cheaper. */
4344 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
4345 {
4346    RCEC *ec0, *ec1, *ec2;
4347    if (ec == *headp)
4348       tl_assert(0); /* already at head of list */
4349    tl_assert(ec != NULL);
4350    ec0 = *headp;
4351    ec1 = NULL;
4352    ec2 = NULL;
4353    while (True) {
4354       if (ec0 == NULL || ec0 == ec) break;
4355       ec2 = ec1;
4356       ec1 = ec0;
4357       ec0 = ec0->next;
4358    }
4359    tl_assert(ec0 == ec);
4360    if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
4361       RCEC* tmp;
4362       /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
4363          predecessor.  Swap ec0 and ec1, that is, move ec0 one step
4364          closer to the start of the list. */
4365       tl_assert(ec2->next == ec1);
4366       tl_assert(ec1->next == ec0);
4367       tmp = ec0->next;
4368       ec2->next = ec0;
4369       ec0->next = ec1;
4370       ec1->next = tmp;
4371    }
4372    else
4373    if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
4374       /* it's second in the list. */
4375       tl_assert(*headp == ec1);
4376       tl_assert(ec1->next == ec0);
4377       ec1->next = ec0->next;
4378       ec0->next = ec1;
4379       *headp = ec0;
4380    }
4381 }
4382 
4383 
4384 /* Find the given RCEC in the hash table, and return a pointer to it.  Or,
4385    if not present, add the given one to the table (by making a copy of
4386    it, so the caller can immediately deallocate the original) and
4387    return a pointer to the copy.  The caller can safely have 'example'
4388    on its stack, since we will always return a pointer to a copy of
4389    it, not to the original.  Note that the inserted node will have .rc
4390    of zero and so the caller must immediately increment it. */
4391 __attribute__((noinline))
4392 static RCEC* ctxt__find_or_add ( RCEC* example )
4393 {
4394    UWord hent;
4395    RCEC* copy;
4396 
4397    if (CHECK_CEM) {
4398       /* Note that the single caller of ctxt__find_or_add always provides
4399          &thr->cached_rcec as argument. The sanity of thr->cached_rcec is always
4400          checked when a thread terminates. */
4401       tl_assert(example && example->magic == RCEC_MAGIC);
4402       tl_assert(example->rc == 0);
4403    }
4404 
4405    /* Search the hash table to see if we already have it. */
4406    stats__ctxt_tab_qs++;
4407    hent = example->frames_hash % N_RCEC_TAB;
4408    copy = contextTab[hent];
4409    while (1) {
4410       if (!copy) break;
4411       if (CHECK_CEM)
4412          tl_assert(copy->magic == RCEC_MAGIC);
4413       stats__ctxt_tab_cmps++;
4414       if (!RCEC__differs_by_frames(copy, example)) break;
4415       copy = copy->next;
4416    }
4417 
4418    if (copy) {
4419       tl_assert(copy != example);
4420       /* optimisation: if it's not at the head of its list, move 1
4421          step fwds, to make future searches cheaper */
4422       if (copy != contextTab[hent]) {
4423          move_RCEC_one_step_forward( &contextTab[hent], copy );
4424       }
4425    } else {
4426       copy = alloc_RCEC();
4427       tl_assert(copy != example);
4428       *copy = *example;
4429       copy->next = contextTab[hent];
4430       contextTab[hent] = copy;
4431       stats__ctxt_tab_curr++;
4432       if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
4433          stats__ctxt_tab_max = stats__ctxt_tab_curr;
4434    }
4435    return copy;
4436 }
4437 
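/* Rotate w left by n bits (n must be in 1 .. bits-per-word minus 1).
   Used in get_RCEC below to mix the N_FRAMES return addresses of a
   stack trace into frames_hash. */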
4438 static inline UWord ROLW ( UWord w, Int n )
4439 {
4440    Int bpw = 8 * sizeof(UWord);
4441    w = (w << n) | (w >> (bpw-n));
4442    return w;
4443 }
4444 
4445 static UWord stats__cached_rcec_identical = 0;
4446 static UWord stats__cached_rcec_updated = 0;
4447 static UWord stats__cached_rcec_fresh = 0;
4448 static UWord stats__cached_rcec_diff = 0;
4449 static UWord stats__cached_rcec_diff_known_reason = 0;
4450 
4451 /* Check if the cached rcec in thr corresponds to the current
4452    stacktrace of the thread. Returns True if ok, False otherwise.
4453    This is just used for debugging the cached rcec logic, activated
4454    using --hg-sanity-flags=xx1xxx i.e. SCE_ACCESS flag.
4455    When this flag is activated, a call to this function will happen each time
4456    a stack trace is needed for a memory access. */
4457 __attribute__((noinline))
4458 static Bool check_cached_rcec_ok (Thr* thr, Addr previous_frame0)
4459 {
4460    Bool  ok = True;
4461    UInt  i;
4462    UWord frames[N_FRAMES];
4463    UWord sps[N_FRAMES];
4464    UWord fps[N_FRAMES];
4465    const DiEpoch cur_ep = VG_(current_DiEpoch)();
4466 
4467    for (i = 0; i < N_FRAMES; i++)
4468       frames[i] = sps[i] = fps[i] = 0;
4469    VG_(get_StackTrace)( thr->hgthread->coretid, &frames[0], N_FRAMES,
4470                         &sps[0], &fps[0], 0);
4471    for (i = 0; i < N_FRAMES; i++) {
4472       if ( thr->cached_rcec.frames[i] != frames[i] ) {
4473          /* There are a bunch of "normal" reasons for which a stack
4474             derived from the cached rcec differs from frames. */
4475          const HChar *reason = NULL;
4476 
4477          /* Old linkers (e.g. RHEL5) gave no cfi unwind information in the PLT
4478             section (fix was added in binutils around June 2011).
4479             Without PLT unwind info, stacktrace in the PLT section are
4480             missing an entry. E.g. the cached stacktrace is:
4481               ==4463==    at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4482               ==4463==    by 0x33B7F9: __libc_thread_freeres
4483                                                 (in /lib/libc-2.11.2.so)
4484               ==4463==    by 0x39BA4F: start_thread (pthread_create.c:307)
4485               ==4463==    by 0x2F107D: clone (clone.S:130)
4486            while the 'check stacktrace' is
4487               ==4463==    at 0x2035C0: ___tls_get_addr (dl-tls.c:753)
4488               ==4463==    by 0x33B82D: strerror_thread_freeres
4489                                                 (in /lib/libc-2.11.2.so)
4490               ==4463==    by 0x33B7F9: __libc_thread_freeres
4491                                                 (in /lib/libc-2.11.2.so)
4492               ==4463==    by 0x39BA4F: start_thread (pthread_create.c:307)
4493               ==4463==    by 0x2F107D: clone (clone.S:130)
4494            No cheap/easy way to detect or fix that. */
4495 
4496          /* It seems that sometimes, the CFI unwind info looks wrong
4497             for a 'ret' instruction. E.g. here is the unwind info
4498             for a 'retq' on gcc20 (amd64, Debian 7)
4499                 [0x4e3ddfe .. 0x4e3ddfe]: let cfa=oldSP+48 in RA=*(cfa+-8)
4500                                                       SP=cfa+0 BP=*(cfa+-24)
4501             This unwind info looks doubtful, as the RA should be at oldSP.
4502             No easy way to detect this problem.
4503             This gives a difference between cached rcec and
4504             current stack trace: the cached rcec is correct. */
4505 
4506          /* When returning from main, unwind info becomes erratic.
4507             So, by default, only report errors for main and above,
4508             unless asked to show below main. */
4509          if (reason == NULL) {
4510             UInt fr_main;
4511             Vg_FnNameKind fr_kind;
4512             for (fr_main = 0; fr_main < N_FRAMES; fr_main++) {
4513                fr_kind = VG_(get_fnname_kind_from_IP)
4514                                 (cur_ep, frames[fr_main]);
4515                if (fr_kind == Vg_FnNameMain || fr_kind == Vg_FnNameBelowMain)
4516                   break;
4517             }
4518             UInt kh_main;
4519             Vg_FnNameKind kh_kind;
4520             for (kh_main = 0; kh_main < N_FRAMES; kh_main++) {
4521                kh_kind = VG_(get_fnname_kind_from_IP)
4522                                 (cur_ep, thr->cached_rcec.frames[kh_main]);
4523                if (kh_kind == Vg_FnNameMain || kh_kind == Vg_FnNameBelowMain)
4524                   break;
4525             }
4526             if (kh_main == fr_main
4527                 && kh_kind == fr_kind
4528                 && (kh_main < i || (kh_main == i
4529                                     && kh_kind == Vg_FnNameBelowMain))) {
4530                // found main or below main before the difference
4531                reason = "Below main";
4532             }
4533          }
4534 
4535          /* We have places where the stack is missing some internal
4536             pthread functions. For such stacktraces, GDB reports only
4537             one function, telling:
4538                #0  0xf7fa81fe in _L_unlock_669 ()
4539                               from /lib/i386-linux-gnu/libpthread.so.0
4540                Backtrace stopped: previous frame identical to
4541                                             this frame (corrupt stack?)
4542 
4543             This is when sps and fps are identical.
4544             The cached stack trace is then
4545                ==3336==    at 0x40641FE: _L_unlock_669
4546                                               (pthread_mutex_unlock.c:310)
4547                ==3336==    by 0x40302BE: pthread_mutex_unlock
4548                                               (hg_intercepts.c:710)
4549                ==3336==    by 0x80486AF: main (cond_timedwait_test.c:14)
4550            while the 'check stacktrace' is
4551                ==3336==    at 0x40641FE: _L_unlock_669
4552                                               (pthread_mutex_unlock.c:310)
4553                ==3336==    by 0x4064206: _L_unlock_669
4554                                               (pthread_mutex_unlock.c:310)
4555                ==3336==    by 0x4064132: __pthread_mutex_unlock_usercnt
4556                                               (pthread_mutex_unlock.c:57)
4557                ==3336==    by 0x40302BE: pthread_mutex_unlock
4558                                                (hg_intercepts.c:710)
4559                ==3336==    by 0x80486AF: main (cond_timedwait_test.c:14) */
4560          if (reason == NULL) {
4561             if ((i > 0
4562                       && sps[i] == sps[i-1] && fps[i] == fps[i-1])
4563                 || (i < N_FRAMES-1
4564                       && sps[i] == sps[i+1] && fps[i] == fps[i+1])) {
4565                reason = "previous||next frame: identical sp and fp";
4566             }
4567          }
4568          if (reason == NULL) {
4569             if ((i > 0
4570                       && fps[i] == fps[i-1])
4571                 || (i < N_FRAMES-1
4572                       && fps[i] == fps[i+1])) {
4573                reason = "previous||next frame: identical fp";
4574             }
4575          }
4576 
4577          /* When we have a read or write 'in the middle of a push instruction',
4578             then the normal backtrace is not very good, while the helgrind
4579             stacktrace is better, as it undoes the not yet fully finished
4580             push instruction before getting the stacktrace. */
4581          if (reason == NULL && thr->hgthread->first_sp_delta != 0) {
4582             reason = "fixupSP probably needed for check stacktrace";
4583          }
4584 
4585          /* Unwinding becomes hectic when running the exit handlers.
4586             None of GDB, cached stacktrace and check stacktrace corresponds.
4587             So, if we find __run_exit_handlers, ignore the difference. */
4588          if (reason == NULL) {
4589             const HChar *fnname;
4590             for (UInt f = 0; f < N_FRAMES; f++) {
4591                if (VG_(get_fnname)( cur_ep, frames[f], &fnname)
4592                    && VG_(strcmp) ("__run_exit_handlers", fnname) == 0) {
4593                   reason = "exit handlers";
4594                   break;
4595                }
4596             }
4597          }
4598 
4599          // Show what we have found for this difference
4600          if (reason == NULL) {
4601             ok = False;
4602             stats__cached_rcec_diff++;
4603          } else {
4604             ok = True;
4605             stats__cached_rcec_diff_known_reason++;
4606          }
4607          if (!ok || VG_(clo_verbosity) > 2) {
4608             Bool save_show_below_main = VG_(clo_show_below_main);
4609             VG_(clo_show_below_main) = True;
4610             /* The below error msg reports an unexpected diff in 'frame %d'.
4611                The (maybe wrong) pc found in the cached stacktrace is
4612                'cached_pc %p' while an unwind gives the (maybe wrong)
4613                'check_pc %p'.
4614                After, 'previous_frame0 %p' tells where the cached stacktrace
4615                was taken.
4616                This is then followed by the full resulting cached stack trace
4617                and the full stack trace found doing unwind.
4618                Such a diff can have various origins:
4619                  * a bug in the unwinder, when the cached stack trace was taken
4620                    at 'previous_frame0'
4621                  * a bug in the unwinder, when the check stack trace was taken
4622                    (i.e. at current pc).
4623                  * a missing 'invalidate cache stack trace' somewhere in the
4624                    instructions between 'previous_frame0' and current_pc.
4625                To investigate the last case, typically, disass the range of
4626                instructions where an invalidate cached stack might miss. */
4627             VG_(printf)("%s diff tid %d frame %d "
4628                         "cached_pc %p check_pc %p\n",
4629                         reason ? reason : "unexpected",
4630                         thr->hgthread->coretid,
4631                         i,
4632                         (void*)thr->cached_rcec.frames[i],
4633                         (void*)frames[i]);
4634             VG_(printf)("cached stack trace previous_frame0 %p\n",
4635                         (void*)previous_frame0);
4636             VG_(pp_StackTrace)(cur_ep, &previous_frame0, 1);
4637             VG_(printf)("resulting cached stack trace:\n");
4638             VG_(pp_StackTrace)(cur_ep, thr->cached_rcec.frames, N_FRAMES);
4639             VG_(printf)("check stack trace:\n");
4640             VG_(pp_StackTrace)(cur_ep, frames, N_FRAMES);
4641 
4642             VG_(show_sched_status) (False,  // host_stacktrace
4643                                     False,  // stack_usage
4644                                     False); // exited_threads
4645             if (VG_(clo_vgdb_error) == 1234567890) // HACK TO ALLOW TO DEBUG
4646                VG_(gdbserver) ( thr->hgthread->coretid );
4647             VG_(clo_show_below_main) = save_show_below_main;
4648          }
4649          break; // Stop giving more errors for this stacktrace.
4650       }
4651    }
4652    return ok;
4653 }
4654 
4655 __attribute__((noinline))
4656 static RCEC* get_RCEC ( Thr* thr )
4657 {
4658    UInt  i;
4659    UWord hash;
4660    Addr  previous_frame0 = 0; // Assignment needed to silence gcc
4661    RCEC  *res;
4662    const Bool thr_cached_rcec_valid = cached_rcec_valid(thr);
4663    const Addr cur_ip = VG_(get_IP)(thr->hgthread->coretid);
4664 
4665    if (DEBUG_CACHED_RCEC)
4666       VG_(printf)("get rcec tid %d at IP %p SP %p"
4667                   " first_sp_delta %ld cached valid %d\n",
4668                   thr->hgthread->coretid,
4669                   (void*)cur_ip,
4670                   (void*)VG_(get_SP)(thr->hgthread->coretid),
4671                   thr->hgthread->first_sp_delta, thr_cached_rcec_valid);
4672 
4673    /* If we have a valid cached rcec, derive the new rcec from the cached one
4674       and update the cached one.
4675       Otherwise, compute a fresh rcec. */
4676 
4677    if (thr_cached_rcec_valid) {
4678       /* Update the stacktrace of the cached rcec with the current IP */
4679       previous_frame0 = thr->cached_rcec.frames[0];
4680       thr->cached_rcec.frames[0] = cur_ip;
4681 
4682 #     if defined(VGP_x86_linux)
4683       // See m_stacktrace.c kludge
4684       extern Addr VG_(client__dl_sysinfo_int80);
4685       /// #include pub_core_clientstate needed for the above ????
4686       /// or move the above into a pub_tool_??? tool_stacktrace.h maybe ????
4687       if (VG_(client__dl_sysinfo_int80) != 0 /* we know its address */
4688           && cur_ip >= VG_(client__dl_sysinfo_int80)
4689           && cur_ip < VG_(client__dl_sysinfo_int80)+3
4690           ) {
4691          thr->cached_rcec.frames[0]
4692             = (ULong) *(Addr*)(UWord)VG_(get_SP)(thr->hgthread->coretid);
4693       }
4694 #     endif
4695 
4696       if (previous_frame0 == thr->cached_rcec.frames[0])
4697          stats__cached_rcec_identical++;
4698       else
4699          stats__cached_rcec_updated++;
4700    } else {
4701       /* Compute a fresh stacktrace. */
4702       main_get_stacktrace( thr, &thr->cached_rcec.frames[0], N_FRAMES );
4703       if (DEBUG_CACHED_RCEC) {
4704          Bool save_show_below_main = VG_(clo_show_below_main);
4705          VG_(clo_show_below_main) = True;
4706          VG_(printf)("caching stack trace:\n");
4707          VG_(pp_StackTrace)(VG_(current_DiEpoch)(),
4708                             &thr->cached_rcec.frames[0], N_FRAMES);
4709          VG_(clo_show_below_main) = save_show_below_main;
4710       }
4711       stats__cached_rcec_fresh++;
4712    }
4713 
4714    hash = 0;
4715    for (i = 0; i < N_FRAMES; i++) {
4716       hash ^= thr->cached_rcec.frames[i];
4717       hash = ROLW(hash, 19);
4718    }
4719    thr->cached_rcec.frames_hash = hash;
4720    res = ctxt__find_or_add( &thr->cached_rcec );
4721 
4722    if (UNLIKELY(HG_(clo_sanity_flags) & SCE_ACCESS)
4723        && thr_cached_rcec_valid) {
4724       /* In case the cached and check stacktraces differ, invalidate the
4725          cached rcec, so that fewer duplicated diffs are reported afterwards. */
4726       if (!check_cached_rcec_ok (thr, previous_frame0))
4727          set_cached_rcec_validity(thr, False);
4728    } else {
4729       if (HG_(clo_delta_stacktrace) && !thr_cached_rcec_valid)
4730             set_cached_rcec_validity(thr, True);
4731    }
4732 
4733    return res;
4734 }
4735 
4736 ///////////////////////////////////////////////////////
4737 //// Part (2):
4738 ///  A hashtable guest-addr -> OldRef, that refers to (1)
4739 ///  Note: we use the guest address as key. This means that the entries
4740 ///  for multiple threads accessing the same address will land in the same
4741 ///  bucket. It might be nice to have a better distribution of the
4742 ///  OldRef in the hashtable by using as key the guest address ^ tsw.
4743 ///  The problem is that when a race is reported on a ga, we need to retrieve
4744 ///  efficiently the accesses to ga by other threads, only using the ga.
4745 ///  Measurements on firefox have shown that the chain length is reasonable.
4746 
4747 /* Records an access: a thread, a context (size & writeness) and the
4748    number of held locks. The size (1,2,4,8) is stored as is in szB.
4749    Note that szB uses more bits than needed to store a size up to 8.
4750    This allows a TSW to be used as a fully initialised UInt, e.g. in
4751    cmp_oldref_tsw. If needed, a more compact representation of szB
4752    could be used (e.g. only 4 bits, or only 2 bits encoding the size
4753    (1,2,4,8) as 00=1, 01=2, 10=4, 11=8; see the sketch after the typedef). */
4754 typedef
4755    struct {
4756       UInt      thrid  : SCALARTS_N_THRBITS;
4757       UInt      szB    : 32 - SCALARTS_N_THRBITS - 1;
4758       UInt      isW    : 1;
4759    } TSW; // Thread+Size+Writeness
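
/* A sketch (illustration only; hypothetical helpers, not used here) of
   the more compact 2-bit szB encoding mentioned above: store log2 of
   the size, so 1,2,4,8 become 0,1,2,3. */
#if 0
static inline UInt toy_encode_szB ( UInt szB ) {
   /* 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3 */
   return szB == 1 ? 0 : szB == 2 ? 1 : szB == 4 ? 2 : 3;
}
static inline UInt toy_decode_szB ( UInt code ) {
   return 1U << code;   /* 0 -> 1, 1 -> 2, 2 -> 4, 3 -> 8 */
}
#endif
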
4760 typedef
4761    struct {
4762       TSW       tsw;
4763       WordSetID locksHeldW;
4764       RCEC*     rcec;
4765    }
4766    Thr_n_RCEC;
4767 
4768 typedef
4769    struct OldRef {
4770       struct OldRef *ht_next; // to link hash table nodes together.
4771       UWord  ga; // hash_table key, == address for which we record an access.
4772       struct OldRef *prev; // to refs older than this one
4773       struct OldRef *next; // to refs newer that this one
4774       UWord stamp; // allows ordering (by time of access) of 2 OldRefs
4775       Thr_n_RCEC acc;
4776    }
4777    OldRef;
4778 
4779 /* Returns the or->tsw as an UInt */
4780 static inline UInt oldref_tsw (const OldRef* or)
4781 {
4782    return *(const UInt*)(&or->acc.tsw);
4783 }
4784 
4785 /* Compare the tsw component for 2 OldRef.
4786    Used for OldRef hashtable (which already verifies equality of the
4787    'key' part). */
4788 static Word cmp_oldref_tsw (const void* node1, const void* node2 )
4789 {
4790    const UInt tsw1 = oldref_tsw(node1);
4791    const UInt tsw2 = oldref_tsw(node2);
4792 
4793    if (tsw1 < tsw2) return -1;
4794    if (tsw1 > tsw2) return  1;
4795    return 0;
4796 }
4797 
4798 
4799 //////////// BEGIN OldRef pool allocator
4800 static PoolAlloc* oldref_pool_allocator;
4801 // Note: We only allocate elements in this pool allocator, we never free them.
4802 // We stop allocating elements at HG_(clo_conflict_cache_size).
4803 //////////// END OldRef pool allocator
4804 
4805 static OldRef mru;
4806 static OldRef lru;
4807 // A doubly linked list, chaining all OldRef in a mru/lru order.
4808 // mru/lru are sentinel nodes.
4809 // Whenever an oldref is re-used, its position is changed as the most recently
4810 // used (i.e. pointed to by mru.prev).
4811 // When a new oldref is needed, it is allocated from the pool
4812 //  if we have not yet reached --conflict-cache-size.
4813 // Otherwise, if all oldref have already been allocated,
4814 // the least recently used (i.e. pointed to by lru.next) is re-used.
4815 // When an OldRef is used, it is moved as the most recently used entry
4816 // (i.e. pointed to by mru.prev).
4817 
4818 // Removes r from the double linked list
4819 // Note: we do not need to test for special cases such as
4820 // NULL next or prev pointers, because we have sentinel nodes
4821 // at both sides of the list. So, a node is always forward and
4822 // backward linked.
4823 static inline void OldRef_unchain(OldRef *r)
4824 {
4825    r->next->prev = r->prev;
4826    r->prev->next = r->next;
4827 }
4828 
4829 // Insert new as the newest OldRef
4830 // Similarly to OldRef_unchain, no need to test for NULL
4831 // pointers, as e.g. mru.prev is always guaranteed to point
4832 // to a non NULL node (lru when the list is empty).
4833 static inline void OldRef_newest(OldRef *new)
4834 {
4835    new->next = &mru;
4836    new->prev = mru.prev;
4837    mru.prev = new;
4838    new->prev->next = new;
4839 }
4840 
4841 
4842 static VgHashTable* oldrefHT    = NULL; /* Hash table* OldRef* */
4843 static UWord     oldrefHTN    = 0;    /* # elems in oldrefHT */
4844 /* Note: the nr of refs in the oldrefHT will always be equal to
4845    the nr of elements that were allocated from the OldRef pool allocator,
4846    as we never free an OldRef: we just re-use them. */
4847 
4848 
4849 /* Allocates a new OldRef, or re-uses the lru one if all allowed OldRefs
4850    have already been allocated. */
4851 static OldRef* alloc_or_reuse_OldRef ( void )
4852 {
4853    if (oldrefHTN < HG_(clo_conflict_cache_size)) {
4854       oldrefHTN++;
4855       return VG_(allocEltPA) ( oldref_pool_allocator );
4856    } else {
4857       OldRef *oldref_ht;
4858       OldRef *oldref = lru.next;
4859 
4860       OldRef_unchain(oldref);
4861       oldref_ht = VG_(HT_gen_remove) (oldrefHT, oldref, cmp_oldref_tsw);
4862       tl_assert (oldref == oldref_ht);
4863       ctxt__rcdec( oldref->acc.rcec );
4864       return oldref;
4865    }
4866 }
4867 
4868 
4869 inline static UInt min_UInt ( UInt a, UInt b ) {
4870    return a < b ? a : b;
4871 }
4872 
4873 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
4874    first interval is lower, 1 if the first interval is higher, and 0
4875    if there is any overlap.  Redundant paranoia with casting is there
4876    following what looked distinctly like a bug in gcc-4.1.2, in which
4877    some of the comparisons were done signedly instead of
4878    unsignedly. */
4879 /* Copied from exp-ptrcheck/sg_main.c */
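/* For example (values for illustration only):
   cmp_nonempty_intervals(0x1000,4, 0x1004,4) == -1 (disjoint, first
   interval lower), while cmp_nonempty_intervals(0x1000,4, 0x1002,8)
   == 0 (the intervals overlap). */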
4880 static inline Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
4881                                             Addr a2, SizeT n2 ) {
4882    UWord a1w = (UWord)a1;
4883    UWord n1w = (UWord)n1;
4884    UWord a2w = (UWord)a2;
4885    UWord n2w = (UWord)n2;
4886    tl_assert(n1w > 0 && n2w > 0);
4887    if (a1w + n1w <= a2w) return -1L;
4888    if (a2w + n2w <= a1w) return 1L;
4889    return 0;
4890 }
4891 
4892 static UWord event_map_stamp = 0; // Used to stamp each OldRef when touched.
4893 
4894 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
4895 {
4896    OldRef  example;
4897    OldRef* ref;
4898    RCEC*   rcec;
4899 
4900    tl_assert(thr);
4901    ThrID thrid = thr->thrid;
4902    tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
4903 
4904    WordSetID locksHeldW = thr->hgthread->locksetW;
4905 
4906    rcec = get_RCEC( thr );
4907 
4908    /* Look in the oldrefHT to see if we already have a record for this
4909       address/thr/sz/isW. */
4910    example.ga = a;
4911    example.acc.tsw = (TSW) {.thrid = thrid,
4912                             .szB = szB,
4913                             .isW = (UInt)(isW & 1)};
4914    ref = VG_(HT_gen_lookup) (oldrefHT, &example, cmp_oldref_tsw);
4915 
4916    if (ref) {
4917       /* We already have a record for this address and this (thrid, R/W,
4918          size) triple. */
4919       tl_assert (ref->ga == a);
4920 
4921       /* thread 'thr' has an entry.  Update its RCEC, if it differs. */
4922       if (rcec == ref->acc.rcec)
4923          stats__ctxt_eq_tsw_eq_rcec++;
4924       else {
4925          stats__ctxt_eq_tsw_neq_rcec++;
4926          ctxt__rcdec( ref->acc.rcec );
4927          ctxt__rcinc(rcec);
4928          ref->acc.rcec       = rcec;
4929       }
4930       tl_assert(ref->acc.tsw.thrid == thrid);
4931       /* Update the stamp, RCEC and the W-held lockset. */
4932       ref->stamp = event_map_stamp;
4933       ref->acc.locksHeldW = locksHeldW;
4934 
4935       OldRef_unchain(ref);
4936       OldRef_newest(ref);
4937 
4938    } else {
4939       tl_assert (szB == 4 || szB == 8 || szB == 1 || szB == 2);
4940       // We only need to check the size the first time we insert a ref.
4941       // Check for most frequent cases first
4942       // Note: we could support a szB up to 1 << (32 - SCALARTS_N_THRBITS - 1)
4943 
4944       /* We don't have a record for this address+triple.  Create a new one. */
4945       stats__ctxt_neq_tsw_neq_rcec++;
4946       ref = alloc_or_reuse_OldRef();
4947       ref->ga = a;
4948       ref->acc.tsw = (TSW) {.thrid  = thrid,
4949                             .szB    = szB,
4950                             .isW    = (UInt)(isW & 1)};
4951       ref->stamp = event_map_stamp;
4952       ref->acc.locksHeldW = locksHeldW;
4953       ref->acc.rcec       = rcec;
4954       ctxt__rcinc(rcec);
4955 
4956       VG_(HT_add_node) ( oldrefHT, ref );
4957       OldRef_newest (ref);
4958    }
4959    event_map_stamp++;
4960 }
4961 
4962 
4963 /* Extract info from the conflicting-access machinery.
4964    Returns the most recent conflicting access with thr/[a, a+szB[/isW. */
4965 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
4966                               /*OUT*/Thr**        resThr,
4967                               /*OUT*/SizeT*       resSzB,
4968                               /*OUT*/Bool*        resIsW,
4969                               /*OUT*/WordSetID*   locksHeldW,
4970                               Thr* thr, Addr a, SizeT szB, Bool isW )
4971 {
4972    Word    i, j;
4973    OldRef *ref = NULL;
4974    SizeT  ref_szB = 0;
4975 
4976    OldRef *cand_ref;
4977    SizeT  cand_ref_szB;
4978    Addr   cand_a;
4979 
4980    Addr toCheck[15];
4981    Int  nToCheck = 0;
4982 
4983    tl_assert(thr);
4984    tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4985 
4986    ThrID thrid = thr->thrid;
4987 
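   /* Collect every start address whose access (of size 1, 2, 4 or 8)
      could overlap [a, a+szB): that is a-7 .. a+szB-1, giving at most
      7 + 8 == 15 addresses.  'a' itself is tried first. */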
4988    toCheck[nToCheck++] = a;
4989    for (i = -7; i < (Word)szB; i++) {
4990       if (i != 0)
4991          toCheck[nToCheck++] = a + i;
4992    }
4993    tl_assert(nToCheck <= 15);
4994 
4995    /* Now see if we can find a suitable matching event for
4996       any of the addresses in toCheck[0 .. nToCheck-1]. */
4997    for (j = 0; j < nToCheck; j++) {
4998 
4999       cand_a = toCheck[j];
5000       //      VG_(printf)("test %ld %p\n", j, cand_a);
5001 
5002       /* Find the first HT element for this address.
5003          We might have several of these. They will be linked via ht_next.
5004          We however need to check various elements as the list contains
5005          all elements that map to the same bucket. */
5006       for (cand_ref = VG_(HT_lookup)( oldrefHT, cand_a );
5007            cand_ref; cand_ref = cand_ref->ht_next) {
5008          if (cand_ref->ga != cand_a)
5009             /* OldRef for another address in this HT bucket. Ignore. */
5010             continue;
5011 
5012          if (cand_ref->acc.tsw.thrid == thrid)
5013             /* This is an access by the same thread, but we're only
5014                interested in accesses from other threads.  Ignore. */
5015             continue;
5016 
5017          if ((!cand_ref->acc.tsw.isW) && (!isW))
5018             /* We don't want to report a read racing against another
5019                read; that's stupid.  So in this case move on. */
5020             continue;
5021 
5022          cand_ref_szB        = cand_ref->acc.tsw.szB;
5023          if (cmp_nonempty_intervals(a, szB, cand_a, cand_ref_szB) != 0)
5024             /* No overlap with the access we're asking about.  Ignore. */
5025             continue;
5026 
5027          /* We have a match. Keep this match if it is newer than
5028             the previous match. Note that stamps are unsigned words, and
5029             for long running applications, event_map_stamp might have wrapped
5030             around. So, 'roll' each stamp using event_map_stamp to get the
5031             stamps in the right order, even if event_map_stamp has wrapped. */
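         /* Worked example (illustration only, using 8-bit arithmetic
            for brevity): suppose event_map_stamp has wrapped and is now
            5.  An OldRef stamped 250 (touched before the wrap) rolls to
            250 - 5 == 245; one stamped 3 (touched after the wrap, hence
            more recent) rolls to 3 - 5 == 254.  Since 254 > 245, the
            more recent access correctly wins the comparison below. */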
5032          if (!ref
5033              || (ref->stamp - event_map_stamp)
5034                    < (cand_ref->stamp - event_map_stamp)) {
5035             ref = cand_ref;
5036             ref_szB = cand_ref_szB;
5037          }
5038       }
5039 
5040       if (ref) {
5041          /* return with success */
5042          Int n, maxNFrames;
5043          RCEC*     ref_rcec = ref->acc.rcec;
5044          tl_assert(ref->acc.tsw.thrid);
5045          tl_assert(ref_rcec);
5046          tl_assert(ref_rcec->magic == RCEC_MAGIC);
5047          tl_assert(ref_szB >= 1);
5048          /* Count how many non-zero frames we have. */
5049          maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
5050          for (n = 0; n < maxNFrames; n++) {
5051             if (0 == ref_rcec->frames[n]) break;
5052          }
5053          *resEC      = VG_(make_ExeContext_from_StackTrace)(ref_rcec->frames,
5054                                                             n);
5055          *resThr     = Thr__from_ThrID(ref->acc.tsw.thrid);
5056          *resSzB     = ref_szB;
5057          *resIsW     = ref->acc.tsw.isW;
5058          *locksHeldW = ref->acc.locksHeldW;
5059          stats__evm__lookup_found++;
5060          return True;
5061       }
5062 
5063       /* consider next address in toCheck[] */
5064    } /* for (j = 0; j < nToCheck; j++) */
5065 
5066    /* really didn't find anything. */
5067    stats__evm__lookup_notfound++;
5068    return False;
5069 }
5070 
5071 
5072 void libhb_event_map_access_history ( Addr a, SizeT szB, Access_t fn )
5073 {
5074    OldRef *ref = lru.next;
5075    SizeT ref_szB;
5076    Int n;
5077 
5078    while (ref != &mru) {
5079       ref_szB = ref->acc.tsw.szB;
5080       if (cmp_nonempty_intervals(a, szB, ref->ga, ref_szB) == 0) {
5081          RCEC* ref_rcec = ref->acc.rcec;
5082          for (n = 0; n < N_FRAMES; n++) {
5083             if (0 == ref_rcec->frames[n]) {
5084                break;
5085             }
5086          }
5087          (*fn)(ref_rcec->frames, n,
5088                Thr__from_ThrID(ref->acc.tsw.thrid),
5089                ref->ga,
5090                ref_szB,
5091                ref->acc.tsw.isW,
5092                ref->acc.locksHeldW);
5093       }
5094       tl_assert (ref->next == &mru
5095                  || ((ref->stamp - event_map_stamp)
5096                         < ref->next->stamp - event_map_stamp));
5097       ref = ref->next;
5098    }
5099 }
5100 
5101 static void event_map_init ( void )
5102 {
5103    Word i;
5104 
5105    /* Context (RCEC) pool allocator */
5106    rcec_pool_allocator = VG_(newPA) (
5107                              sizeof(RCEC),
5108                              1000 /* RCECs per pool */,
5109                              HG_(zalloc),
5110                              "libhb.event_map_init.1 (RCEC pools)",
5111                              HG_(free)
5112                           );
5113 
5114    /* Context table */
5115    tl_assert(!contextTab);
5116    contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
5117                              N_RCEC_TAB * sizeof(RCEC*) );
5118    for (i = 0; i < N_RCEC_TAB; i++)
5119       contextTab[i] = NULL;
5120 
5121    /* Oldref pool allocator */
5122    oldref_pool_allocator = VG_(newPA)(
5123                                sizeof(OldRef),
5124                                1000 /* OldRefs per pool */,
5125                                HG_(zalloc),
5126                                "libhb.event_map_init.3 (OldRef pools)",
5127                                HG_(free)
5128                             );
5129 
5130    /* Oldref hashtable */
5131    tl_assert(!oldrefHT);
5132    oldrefHT = VG_(HT_construct) ("libhb.event_map_init.4 (oldref hashtable)");
5133 
5134    oldrefHTN = 0;
5135    mru.prev = &lru;
5136    mru.next = NULL;
5137    lru.prev = NULL;
5138    lru.next = &mru;
5139    mru.acc = (Thr_n_RCEC) {.tsw = {.thrid = 0,
5140                                    .szB = 0,
5141                                    .isW = 0},
5142                            .locksHeldW = 0,
5143                            .rcec = NULL};
5144    lru.acc = mru.acc;
5145 }
5146 
5147 static void event_map__check_reference_counts ( void )
5148 {
5149    RCEC*   rcec;
5150    OldRef* oldref;
5151    Word    i;
5152    UWord   nEnts = 0;
5153 
5154    /* Set the 'check' reference counts to zero.  Also, optionally
5155       check that the real reference counts are non-zero.  We allow
5156       these to fall to zero before a GC, but the GC must get rid of
5157       all those that are zero, hence none should be zero after a
5158       GC. */
5159    for (i = 0; i < N_RCEC_TAB; i++) {
5160       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5161          nEnts++;
5162          tl_assert(rcec);
5163          tl_assert(rcec->magic == RCEC_MAGIC);
5164          rcec->rcX = 0;
5165       }
5166    }
5167 
5168    /* check that the stats are sane */
5169    tl_assert(nEnts == stats__ctxt_tab_curr);
5170    tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
5171 
5172    /* visit all the referencing points, inc check ref counts */
5173    VG_(HT_ResetIter)( oldrefHT );
5174    oldref = VG_(HT_Next)( oldrefHT );
5175    while (oldref) {
5176       tl_assert (oldref->acc.tsw.thrid);
5177       tl_assert (oldref->acc.rcec);
5178       tl_assert (oldref->acc.rcec->magic == RCEC_MAGIC);
5179       oldref->acc.rcec->rcX++;
5180       oldref = VG_(HT_Next)( oldrefHT );
5181    }
5182 
5183    /* compare check ref counts with actual */
5184    for (i = 0; i < N_RCEC_TAB; i++) {
5185       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
5186          tl_assert(rcec->rc == rcec->rcX);
5187       }
5188    }
5189 }
5190 
5191 __attribute__((noinline))
5192 static void do_RCEC_GC ( void )
5193 {
5194    UInt i;
5195 
5196    if (VG_(clo_stats)) {
5197       static UInt ctr = 1;
5198       VG_(message)(Vg_DebugMsg,
5199                   "libhb: RCEC GC: #%u  %lu slots,"
5200                    " %lu cur ents(ref'd %lu),"
5201                    " %lu max ents\n",
5202                    ctr++,
5203                    (UWord)N_RCEC_TAB,
5204                    stats__ctxt_tab_curr, RCEC_referenced,
5205                    stats__ctxt_tab_max );
5206    }
5207    tl_assert (stats__ctxt_tab_curr > RCEC_referenced);
5208 
5209    /* Throw away all RCECs with zero reference counts */
5210    for (i = 0; i < N_RCEC_TAB; i++) {
5211       RCEC** pp = &contextTab[i];
5212       RCEC*  p  = *pp;
5213       while (p) {
5214          if (p->rc == 0) {
5215             *pp = p->next;
5216             free_RCEC(p);
5217             p = *pp;
5218             tl_assert(stats__ctxt_tab_curr > 0);
5219             stats__ctxt_rcec_gc_discards++;
5220             stats__ctxt_tab_curr--;
5221          } else {
5222             pp = &p->next;
5223             p = p->next;
5224          }
5225       }
5226    }
5227 
5228    tl_assert (stats__ctxt_tab_curr == RCEC_referenced);
5229 }
5230 
5231 /////////////////////////////////////////////////////////
5232 //                                                     //
5233 // Core MSM                                            //
5234 //                                                     //
5235 /////////////////////////////////////////////////////////
5236 
5237 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
5238    Nov 08, and again after [...],
5239    June 09. */
5240 
5241 static ULong stats__msmcread         = 0;
5242 static ULong stats__msmcread_change  = 0;
5243 static ULong stats__msmcwrite        = 0;
5244 static ULong stats__msmcwrite_change = 0;
5245 
5246 /* Some notes on the H1 history mechanism:
5247 
5248    Transition rules are:
5249 
5250    read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
5251    write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
5252 
5253    After any access by a thread T to a location L, L's constraint pair
5254    (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
5255 
5256    After a race by thread T conflicting with some previous access by
5257    some other thread U, for a location with constraint (before
5258    processing the later access) (Cr,Cw), then Cw[U] is the segment in
5259    which the previous access lies.
5260 
5261    Hence in record_race_info, we pass in Cfailed and Kfailed, which
5262    are compared so as to find out which thread(s) this access
5263    conflicts with.  Once that is established, we also require the
5264    pre-update Cw for the location, so we can index into it for those
5265    threads, to get the scalar clock values for the point at which the
5266    former accesses were made.  (In fact we only bother to do any of
5267    this for an arbitrarily chosen one of the conflicting threads, as
5268    that's simpler, it avoids flooding the user with vast amounts of
5269    mostly useless information, and because the program is wrong if it
5270    contains any races at all -- so we don't really need to show all
5271    conflicting access pairs initially, so long as we only show none if
5272    none exist).
5273 
5274    ---
5275 
5276    That requires the auxiliary proof that
5277 
5278       (Cr `join` Kw)[T] == Kw[T]
5279 
5280    Why should that be true?  Because for any thread T, Kw[T] >= the
5281    scalar clock value for T known by any other thread.  In other
5282    words, because T's value for its own scalar clock is at least as up
5283    to date as the value for it known by any other thread (that is true
5284    for both the R- and W- scalar clocks).  Hence no other thread will
5285    be able to feed in a value for that element (indirectly via a
5286    constraint) which will exceed Kw[T], and hence the join cannot
5287    cause that particular element to advance.
5288 */
5289 
5290 __attribute__((noinline))
5291 static void record_race_info ( Thr* acc_thr,
5292                                Addr acc_addr, SizeT szB, Bool isWrite,
5293                                VtsID Cfailed,
5294                                VtsID Kfailed,
5295                                VtsID Cw )
5296 {
5297    /* Call here to report a race.  We just hand it onwards to
5298       HG_(record_error_Race).  If that in turn discovers that the
5299       error is going to be collected, then, at history_level 2, that
5300       queries the conflicting-event map.  The alternative would be to
5301       query it right here.  But that causes a lot of pointless queries
5302       for errors which will shortly be discarded as duplicates, and
5303       can become a performance overhead; so we defer the query until
5304       we know the error is not a duplicate. */
5305 
5306    /* Stacks for the bounds of the (or one of the) conflicting
5307       segment(s).  These are only set at history_level 1. */
5308    ExeContext* hist1_seg_start = NULL;
5309    ExeContext* hist1_seg_end   = NULL;
5310    Thread*     hist1_conf_thr  = NULL;
5311 
5312    tl_assert(acc_thr);
5313    tl_assert(acc_thr->hgthread);
5314    tl_assert(acc_thr->hgthread->hbthr == acc_thr);
5315    tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
5316 
5317    if (HG_(clo_history_level) == 1) {
5318       Bool found;
5319       Word firstIx, lastIx;
5320       ULong_n_EC key;
5321 
5322       /* At history_level 1, we must round up the relevant stack-pair
5323          for the conflicting segment right now.  This is because
5324          deferring it is complex; we can't (easily) put Kfailed and
5325          Cfailed into the XError and wait for later without
5326          getting tied up in difficulties with VtsID reference
5327          counting.  So just do it now. */
5328       Thr*  confThr;
5329       ULong confTym = 0;
5330       /* Which thread are we in conflict with?  There may be more than
5331          one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
5332          (in fact it's the one with the lowest Thr* value). */
5333       confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
5334       /* This must exist!  since if it was NULL then there's no
5335          conflict (semantics of return value of
5336          VtsID__findFirst_notLEQ), and msmc{read,write}, which has
5337          called us, just checked exactly this -- that there was in
5338          fact a race. */
5339       tl_assert(confThr);
5340 
5341       /* Get the scalar clock value that the conflicting thread
5342          introduced into the constraint.  A careful examination of the
5343          base machine rules shows that this must be the same as the
5344          conflicting thread's scalar clock when it created this
5345          constraint.  Hence we know the scalar clock of the
5346          conflicting thread when the conflicting access was made. */
5347       confTym = VtsID__indexAt( Cfailed, confThr );
5348 
5349       /* Using this scalar clock, index into the conflicting thread's
5350          collection of stack traces made each time its vector clock
5351          (hence its scalar clock) changed.  This gives the stack
5352          traces at the start and end of the conflicting segment (well,
5353          as per comment just above, of one of the conflicting
5354          segments, if there are more than one). */
5355       key.ull = confTym;
5356       key.ec  = NULL;
5357       /* tl_assert(confThr); -- asserted just above */
5358       tl_assert(confThr->local_Kws_n_stacks);
5359       firstIx = lastIx = 0;
5360       found = VG_(lookupXA_UNSAFE)(
5361                  confThr->local_Kws_n_stacks,
5362                  &key, &firstIx, &lastIx,
5363                  (XACmpFn_t)cmp__ULong_n_EC__by_ULong
5364               );
5365       if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
5366                          "confTym %llu found %d (%ld,%ld)\n",
5367                          Cfailed, Kfailed, Cw,
5368                          confThr, confTym, found, firstIx, lastIx);
5369       /* We can't indefinitely collect stack traces at VTS
5370          transitions, since we'd eventually run out of memory.  Hence
5371          note_local_Kw_n_stack_for will eventually throw away old
5372          ones, which in turn means we might fail to find index value
5373          confTym in the array. */
5374       if (found) {
5375          ULong_n_EC *pair_start, *pair_end;
5376          pair_start
5377             = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
5378          hist1_seg_start = pair_start->ec;
5379          if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
5380             pair_end
5381                = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
5382                                             lastIx+1 );
5383             /* from properties of VG_(lookupXA) and the comparison fn used: */
5384             tl_assert(pair_start->ull < pair_end->ull);
5385             hist1_seg_end = pair_end->ec;
5386             /* Could do a bit better here.  It may be that pair_end
5387                doesn't have a stack, but the following entries in the
5388                array have the same scalar Kw and do have a stack.  So
5389                we should search a bit further along the array than
5390                lastIx+1 if hist1_seg_end is NULL. */
5391          } else {
5392             if (!confThr->llexit_done)
5393                hist1_seg_end = main_get_EC( confThr );
5394          }
5395          // seg_start could be NULL iff this is the first stack in the thread
5396          //if (seg_start) VG_(pp_ExeContext)(seg_start);
5397          //if (seg_end)   VG_(pp_ExeContext)(seg_end);
5398          hist1_conf_thr = confThr->hgthread;
5399       }
5400    }
5401 
5402    HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
5403                            szB, isWrite,
5404                            hist1_conf_thr, hist1_seg_start, hist1_seg_end );
5405 }
5406 
5407 static Bool is_sane_SVal_C ( SVal sv ) {
5408    Bool leq;
5409    if (!SVal__isC(sv)) return True;
5410    leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
5411    return leq;
5412 }
5413 
5414 
5415 /* Compute new state following a read */
5416 static inline SVal msmcread ( SVal svOld,
5417                               /* The following are only needed for
5418                                  creating error reports. */
5419                               Thr* acc_thr,
5420                               Addr acc_addr, SizeT szB )
5421 {
5422    SVal svNew = SVal_INVALID;
5423    stats__msmcread++;
5424 
5425    /* Redundant sanity check on the constraints */
5426    if (CHECK_MSM) {
5427       tl_assert(is_sane_SVal_C(svOld));
5428    }
5429 
5430    if (LIKELY(SVal__isC(svOld))) {
5431       VtsID tviR  = acc_thr->viR;
5432       VtsID tviW  = acc_thr->viW;
5433       VtsID rmini = SVal__unC_Rmin(svOld);
5434       VtsID wmini = SVal__unC_Wmin(svOld);
5435       Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
5436       if (LIKELY(leq)) {
5437          /* no race */
5438          /* Note: RWLOCK subtlety: use tviW, not tviR */
5439          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5440          goto out;
5441       } else {
5442          /* assert on sanity of constraints. */
5443          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5444          tl_assert(leqxx);
5445          // same as in non-race case
5446          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
5447          record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
5448                            rmini, /* Cfailed */
5449                            tviR,  /* Kfailed */
5450                            wmini  /* Cw */ );
5451          goto out;
5452       }
5453    }
5454    if (SVal__isA(svOld)) {
5455       /* reading no-access memory (sigh); leave unchanged */
5456       /* check for no pollution */
5457       tl_assert(svOld == SVal_NOACCESS);
5458       svNew = SVal_NOACCESS;
5459       goto out;
5460    }
5461    if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
5462    tl_assert(0);
5463 
5464   out:
5465    if (CHECK_MSM) {
5466       tl_assert(is_sane_SVal_C(svNew));
5467    }
5468    if (UNLIKELY(svNew != svOld)) {
5469       tl_assert(svNew != SVal_INVALID);
5470       if (HG_(clo_history_level) >= 2
5471           && SVal__isC(svOld) && SVal__isC(svNew)) {
5472          event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
5473          stats__msmcread_change++;
5474       }
5475    }
5476    return svNew;
5477 }
5478 
5479 
5480 /* Compute new state following a write */
5481 static inline SVal msmcwrite ( SVal svOld,
5482                               /* The following are only needed for
5483                                  creating error reports. */
5484                               Thr* acc_thr,
5485                               Addr acc_addr, SizeT szB )
5486 {
5487    SVal svNew = SVal_INVALID;
5488    stats__msmcwrite++;
5489 
5490    /* Redundant sanity check on the constraints */
5491    if (CHECK_MSM) {
5492       tl_assert(is_sane_SVal_C(svOld));
5493    }
5494 
5495    if (LIKELY(SVal__isC(svOld))) {
5496       VtsID tviW  = acc_thr->viW;
5497       VtsID wmini = SVal__unC_Wmin(svOld);
5498       Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
5499       if (LIKELY(leq)) {
5500          /* no race */
5501          svNew = SVal__mkC( tviW, tviW );
5502          goto out;
5503       } else {
5504          VtsID rmini = SVal__unC_Rmin(svOld);
5505          /* assert on sanity of constraints. */
5506          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
5507          tl_assert(leqxx);
5508          // same as in non-race case
5509          // proof: in the non-race case, we have
5510          //    rmini <= wmini (invar on constraints)
5511          //    tviW <= tviR (invar on thread clocks)
5512          //    wmini <= tviW (from run-time check)
5513          // hence from transitivity of <= we have
5514          //    rmini <= wmini <= tviW
5515          // and so join(rmini,tviW) == tviW
5516          // and    join(wmini,tviW) == tviW
5517          // qed.
5518          svNew = SVal__mkC( VtsID__join2(rmini, tviW),
5519                             VtsID__join2(wmini, tviW) );
5520          record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
5521                            wmini, /* Cfailed */
5522                            tviW,  /* Kfailed */
5523                            wmini  /* Cw */ );
5524          goto out;
5525       }
5526    }
5527    if (SVal__isA(svOld)) {
5528       /* writing no-access memory (sigh); leave unchanged */
5529       /* check for no pollution */
5530       tl_assert(svOld == SVal_NOACCESS);
5531       svNew = SVal_NOACCESS;
5532       goto out;
5533    }
5534    if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
5535    tl_assert(0);
5536 
5537   out:
5538    if (CHECK_MSM) {
5539       tl_assert(is_sane_SVal_C(svNew));
5540    }
5541    if (UNLIKELY(svNew != svOld)) {
5542       tl_assert(svNew != SVal_INVALID);
5543       if (HG_(clo_history_level) >= 2
5544           && SVal__isC(svOld) && SVal__isC(svNew)) {
5545          event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
5546          stats__msmcwrite_change++;
5547       }
5548    }
5549    return svNew;
5550 }
5551 
5552 
5553 /////////////////////////////////////////////////////////
5554 //                                                     //
5555 // Apply core MSM to specific memory locations         //
5556 //                                                     //
5557 /////////////////////////////////////////////////////////
5558 
5559 /*------------- ZSM accesses: 8 bit sapply ------------- */
5560 
5561 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
5562    CacheLine* cl;
5563    UWord      cloff, tno, toff;
5564    SVal       svOld, svNew;
5565    UShort     descr;
5566    stats__cline_cread08s++;
5567    cl    = get_cacheline(a);
5568    cloff = get_cacheline_offset(a);
5569    tno   = get_treeno(a);
5570    toff  = get_tree_offset(a); /* == 0 .. 7 */
5571    descr = cl->descrs[tno];
5572    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5573       SVal* tree = &cl->svals[tno << 3];
5574       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5575       if (CHECK_ZSM)
5576          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5577    }
5578    svOld = cl->svals[cloff];
5579    svNew = msmcread( svOld, thr,a,1 );
5580    if (CHECK_ZSM)
5581       tl_assert(svNew != SVal_INVALID);
5582    cl->svals[cloff] = svNew;
5583 }
5584 
5585 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
5586    CacheLine* cl;
5587    UWord      cloff, tno, toff;
5588    SVal       svOld, svNew;
5589    UShort     descr;
5590    stats__cline_cwrite08s++;
5591    cl    = get_cacheline(a);
5592    cloff = get_cacheline_offset(a);
5593    tno   = get_treeno(a);
5594    toff  = get_tree_offset(a); /* == 0 .. 7 */
5595    descr = cl->descrs[tno];
5596    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5597       SVal* tree = &cl->svals[tno << 3];
5598       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5599       if (CHECK_ZSM)
5600          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5601    }
5602    svOld = cl->svals[cloff];
5603    svNew = msmcwrite( svOld, thr,a,1 );
5604    if (CHECK_ZSM)
5605       tl_assert(svNew != SVal_INVALID);
5606    cl->svals[cloff] = svNew;
5607 }
5608 
5609 /*------------- ZSM accesses: 16 bit sapply ------------- */
5610 
5611 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
5612    CacheLine* cl;
5613    UWord      cloff, tno, toff;
5614    SVal       svOld, svNew;
5615    UShort     descr;
5616    stats__cline_cread16s++;
5617    if (UNLIKELY(!aligned16(a))) goto slowcase;
5618    cl    = get_cacheline(a);
5619    cloff = get_cacheline_offset(a);
5620    tno   = get_treeno(a);
5621    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5622    descr = cl->descrs[tno];
5623    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5624       if (valid_value_is_below_me_16(descr, toff)) {
5625          goto slowcase;
5626       } else {
5627          SVal* tree = &cl->svals[tno << 3];
5628          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5629       }
5630       if (CHECK_ZSM)
5631          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5632    }
5633    svOld = cl->svals[cloff];
5634    svNew = msmcread( svOld, thr,a,2 );
5635    if (CHECK_ZSM)
5636       tl_assert(svNew != SVal_INVALID);
5637    cl->svals[cloff] = svNew;
5638    return;
5639   slowcase: /* misaligned, or must go further down the tree */
5640    stats__cline_16to8splits++;
5641    zsm_sapply08__msmcread( thr, a + 0 );
5642    zsm_sapply08__msmcread( thr, a + 1 );
5643 }
5644 
5645 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
5646    CacheLine* cl;
5647    UWord      cloff, tno, toff;
5648    SVal       svOld, svNew;
5649    UShort     descr;
5650    stats__cline_cwrite16s++;
5651    if (UNLIKELY(!aligned16(a))) goto slowcase;
5652    cl    = get_cacheline(a);
5653    cloff = get_cacheline_offset(a);
5654    tno   = get_treeno(a);
5655    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5656    descr = cl->descrs[tno];
5657    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5658       if (valid_value_is_below_me_16(descr, toff)) {
5659          goto slowcase;
5660       } else {
5661          SVal* tree = &cl->svals[tno << 3];
5662          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5663       }
5664       if (CHECK_ZSM)
5665          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5666    }
5667    svOld = cl->svals[cloff];
5668    svNew = msmcwrite( svOld, thr,a,2 );
5669    if (CHECK_ZSM)
5670       tl_assert(svNew != SVal_INVALID);
5671    cl->svals[cloff] = svNew;
5672    return;
5673   slowcase: /* misaligned, or must go further down the tree */
5674    stats__cline_16to8splits++;
5675    zsm_sapply08__msmcwrite( thr, a + 0 );
5676    zsm_sapply08__msmcwrite( thr, a + 1 );
5677 }
5678 
5679 /*------------- ZSM accesses: 32 bit sapply ------------- */
5680 
5681 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
5682    CacheLine* cl;
5683    UWord      cloff, tno, toff;
5684    SVal       svOld, svNew;
5685    UShort     descr;
5686    stats__cline_cread32s++;
5687    if (UNLIKELY(!aligned32(a))) goto slowcase;
5688    cl    = get_cacheline(a);
5689    cloff = get_cacheline_offset(a);
5690    tno   = get_treeno(a);
5691    toff  = get_tree_offset(a); /* == 0 or 4 */
5692    descr = cl->descrs[tno];
5693    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5694       if (valid_value_is_above_me_32(descr, toff)) {
5695          SVal* tree = &cl->svals[tno << 3];
5696          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5697       } else {
5698          goto slowcase;
5699       }
5700       if (CHECK_ZSM)
5701          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5702    }
5703    svOld = cl->svals[cloff];
5704    svNew = msmcread( svOld, thr,a,4 );
5705    if (CHECK_ZSM)
5706       tl_assert(svNew != SVal_INVALID);
5707    cl->svals[cloff] = svNew;
5708    return;
5709   slowcase: /* misaligned, or must go further down the tree */
5710    stats__cline_32to16splits++;
5711    zsm_sapply16__msmcread( thr, a + 0 );
5712    zsm_sapply16__msmcread( thr, a + 2 );
5713 }
5714 
5715 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
5716    CacheLine* cl;
5717    UWord      cloff, tno, toff;
5718    SVal       svOld, svNew;
5719    UShort     descr;
5720    stats__cline_cwrite32s++;
5721    if (UNLIKELY(!aligned32(a))) goto slowcase;
5722    cl    = get_cacheline(a);
5723    cloff = get_cacheline_offset(a);
5724    tno   = get_treeno(a);
5725    toff  = get_tree_offset(a); /* == 0 or 4 */
5726    descr = cl->descrs[tno];
5727    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5728       if (valid_value_is_above_me_32(descr, toff)) {
5729          SVal* tree = &cl->svals[tno << 3];
5730          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5731       } else {
5732          goto slowcase;
5733       }
5734       if (CHECK_ZSM)
5735          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5736    }
5737    svOld = cl->svals[cloff];
5738    svNew = msmcwrite( svOld, thr,a,4 );
5739    if (CHECK_ZSM)
5740       tl_assert(svNew != SVal_INVALID);
5741    cl->svals[cloff] = svNew;
5742    return;
5743   slowcase: /* misaligned, or must go further down the tree */
5744    stats__cline_32to16splits++;
5745    zsm_sapply16__msmcwrite( thr, a + 0 );
5746    zsm_sapply16__msmcwrite( thr, a + 2 );
5747 }
5748 
5749 /*------------- ZSM accesses: 64 bit sapply ------------- */
5750 
5751 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
5752    CacheLine* cl;
5753    UWord      cloff, tno;
5754    //UWord      toff;
5755    SVal       svOld, svNew;
5756    UShort     descr;
5757    stats__cline_cread64s++;
5758    if (UNLIKELY(!aligned64(a))) goto slowcase;
5759    cl    = get_cacheline(a);
5760    cloff = get_cacheline_offset(a);
5761    tno   = get_treeno(a);
5762    //toff  = get_tree_offset(a); /* == 0, unused */
5763    descr = cl->descrs[tno];
5764    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5765       goto slowcase;
5766    }
5767    svOld = cl->svals[cloff];
5768    svNew = msmcread( svOld, thr,a,8 );
5769    if (CHECK_ZSM)
5770       tl_assert(svNew != SVal_INVALID);
5771    cl->svals[cloff] = svNew;
5772    return;
5773   slowcase: /* misaligned, or must go further down the tree */
5774    stats__cline_64to32splits++;
5775    zsm_sapply32__msmcread( thr, a + 0 );
5776    zsm_sapply32__msmcread( thr, a + 4 );
5777 }
5778 
5779 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
5780    CacheLine* cl;
5781    UWord      cloff, tno;
5782    //UWord      toff;
5783    SVal       svOld, svNew;
5784    UShort     descr;
5785    stats__cline_cwrite64s++;
5786    if (UNLIKELY(!aligned64(a))) goto slowcase;
5787    cl    = get_cacheline(a);
5788    cloff = get_cacheline_offset(a);
5789    tno   = get_treeno(a);
5790    //toff  = get_tree_offset(a); /* == 0, unused */
5791    descr = cl->descrs[tno];
5792    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
5793       goto slowcase;
5794    }
5795    svOld = cl->svals[cloff];
5796    svNew = msmcwrite( svOld, thr,a,8 );
5797    if (CHECK_ZSM)
5798       tl_assert(svNew != SVal_INVALID);
5799    cl->svals[cloff] = svNew;
5800    return;
5801   slowcase: /* misaligned, or must go further down the tree */
5802    stats__cline_64to32splits++;
5803    zsm_sapply32__msmcwrite( thr, a + 0 );
5804    zsm_sapply32__msmcwrite( thr, a + 4 );
5805 }
5806 
5807 /*--------------- ZSM accesses: 8 bit swrite --------------- */
5808 
5809 static
5810 void zsm_swrite08 ( Addr a, SVal svNew ) {
5811    CacheLine* cl;
5812    UWord      cloff, tno, toff;
5813    UShort     descr;
5814    stats__cline_swrite08s++;
5815    cl    = get_cacheline(a);
5816    cloff = get_cacheline_offset(a);
5817    tno   = get_treeno(a);
5818    toff  = get_tree_offset(a); /* == 0 .. 7 */
5819    descr = cl->descrs[tno];
5820    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5821       SVal* tree = &cl->svals[tno << 3];
5822       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5823       if (CHECK_ZSM)
5824          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5825    }
5826    tl_assert(svNew != SVal_INVALID);
5827    cl->svals[cloff] = svNew;
5828 }
5829 
5830 /*--------------- ZSM accesses: 16 bit swrite --------------- */
5831 
5832 static
5833 void zsm_swrite16 ( Addr a, SVal svNew ) {
5834    CacheLine* cl;
5835    UWord      cloff, tno, toff;
5836    UShort     descr;
5837    stats__cline_swrite16s++;
5838    if (UNLIKELY(!aligned16(a))) goto slowcase;
5839    cl    = get_cacheline(a);
5840    cloff = get_cacheline_offset(a);
5841    tno   = get_treeno(a);
5842    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
5843    descr = cl->descrs[tno];
5844    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
5845       if (valid_value_is_below_me_16(descr, toff)) {
5846          /* Writing at this level.  Need to fix up 'descr'. */
5847          cl->descrs[tno] = pullup_descr_to_16(descr, toff);
5848          /* At this point, the tree does not match cl->descr[tno] any
5849             more.  The assignments below will fix it up. */
5850       } else {
5851          /* We can't indiscriminately write on the w16 node as in the
5852             w64 case, as that might make the node inconsistent with
5853             its parent.  So first, pull down to this level. */
5854          SVal* tree = &cl->svals[tno << 3];
5855          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
5856          if (CHECK_ZSM)
5857             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5858       }
5859    }
5860    tl_assert(svNew != SVal_INVALID);
5861    cl->svals[cloff + 0] = svNew;
5862    cl->svals[cloff + 1] = SVal_INVALID;
5863    return;
5864   slowcase: /* misaligned */
5865    stats__cline_16to8splits++;
5866    zsm_swrite08( a + 0, svNew );
5867    zsm_swrite08( a + 1, svNew );
5868 }
5869 
5870 /*--------------- ZSM accesses: 32 bit swrite --------------- */
5871 
5872 static
5873 void zsm_swrite32 ( Addr a, SVal svNew ) {
5874    CacheLine* cl;
5875    UWord      cloff, tno, toff;
5876    UShort     descr;
5877    stats__cline_swrite32s++;
5878    if (UNLIKELY(!aligned32(a))) goto slowcase;
5879    cl    = get_cacheline(a);
5880    cloff = get_cacheline_offset(a);
5881    tno   = get_treeno(a);
5882    toff  = get_tree_offset(a); /* == 0 or 4 */
5883    descr = cl->descrs[tno];
5884    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
5885       if (valid_value_is_above_me_32(descr, toff)) {
5886          /* We can't indiscriminately write on the w32 node as in the
5887             w64 case, as that might make the node inconsistent with
5888             its parent.  So first, pull down to this level. */
5889          SVal* tree = &cl->svals[tno << 3];
5890          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
5891          if (CHECK_ZSM)
5892             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
5893       } else {
5894          /* Writing at this level.  Need to fix up 'descr'. */
5895          cl->descrs[tno] = pullup_descr_to_32(descr, toff);
5896          /* At this point, the tree does not match cl->descr[tno] any
5897             more.  The assignments below will fix it up. */
5898       }
5899    }
5900    tl_assert(svNew != SVal_INVALID);
5901    cl->svals[cloff + 0] = svNew;
5902    cl->svals[cloff + 1] = SVal_INVALID;
5903    cl->svals[cloff + 2] = SVal_INVALID;
5904    cl->svals[cloff + 3] = SVal_INVALID;
5905    return;
5906   slowcase: /* misaligned */
5907    stats__cline_32to16splits++;
5908    zsm_swrite16( a + 0, svNew );
5909    zsm_swrite16( a + 2, svNew );
5910 }
5911 
5912 /*--------------- ZSM accesses: 64 bit swrite --------------- */
5913 
5914 static
5915 void zsm_swrite64 ( Addr a, SVal svNew ) {
5916    CacheLine* cl;
5917    UWord      cloff, tno;
5918    //UWord    toff;
5919    stats__cline_swrite64s++;
5920    if (UNLIKELY(!aligned64(a))) goto slowcase;
5921    cl    = get_cacheline(a);
5922    cloff = get_cacheline_offset(a);
5923    tno   = get_treeno(a);
5924    //toff  = get_tree_offset(a); /* == 0, unused */
5925    cl->descrs[tno] = TREE_DESCR_64;
5926    if (CHECK_ZSM)
5927       tl_assert(svNew != SVal_INVALID); /* EXPENSIVE */
5928    cl->svals[cloff + 0] = svNew;
5929    cl->svals[cloff + 1] = SVal_INVALID;
5930    cl->svals[cloff + 2] = SVal_INVALID;
5931    cl->svals[cloff + 3] = SVal_INVALID;
5932    cl->svals[cloff + 4] = SVal_INVALID;
5933    cl->svals[cloff + 5] = SVal_INVALID;
5934    cl->svals[cloff + 6] = SVal_INVALID;
5935    cl->svals[cloff + 7] = SVal_INVALID;
5936    return;
5937   slowcase: /* misaligned */
5938    stats__cline_64to32splits++;
5939    zsm_swrite32( a + 0, svNew );
5940    zsm_swrite32( a + 4, svNew );
5941 }
5942 
5943 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
5944 
5945 static
5946 SVal zsm_sread08 ( Addr a ) {
5947    CacheLine* cl;
5948    UWord      cloff, tno, toff;
5949    UShort     descr;
5950    stats__cline_sread08s++;
5951    cl    = get_cacheline(a);
5952    cloff = get_cacheline_offset(a);
5953    tno   = get_treeno(a);
5954    toff  = get_tree_offset(a); /* == 0 .. 7 */
5955    descr = cl->descrs[tno];
5956    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
5957       SVal* tree = &cl->svals[tno << 3];
5958       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
5959    }
5960    return cl->svals[cloff];
5961 }
5962 
5963 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
5964    SVal       sv;
5965    stats__cline_scopy08s++;
5966    sv = zsm_sread08( src );
5967    zsm_swrite08( dst, sv );
5968 }
5969 
5970 
5971 /* Block-copy states (needed for implementing realloc()).  Note this
5972    doesn't change the filtering arrangements.  The caller of
5973    zsm_scopy_range needs to attend to that. */
5974 
5975 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
5976 {
5977    SizeT i;
5978    if (len == 0)
5979       return;
5980 
5981    /* assert for non-overlappingness */
5982    tl_assert(src+len <= dst || dst+len <= src);
5983 
5984    /* To be simple, just copy byte by byte.  But so as not to wreck
5985       performance for later accesses to dst[0 .. len-1], normalise
5986       destination lines as we finish with them, and also normalise the
5987       line containing the first and last address. */
5988    for (i = 0; i < len; i++) {
5989       Bool normalise
5990          = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
5991            || i == 0       /* first in range */
5992            || i == len-1;  /* last in range */
5993       zsm_scopy08( src+i, dst+i, normalise );
5994    }
5995 }
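
/* For example: copying 10 bytes where a destination cache-line
   boundary falls at dst+4, the loop above passes normalise == True on
   the i == 0, i == 3 (last byte of the first line) and i == 9
   iterations.  (As the uu_normalise parameter's name suggests,
   zsm_scopy08 currently ignores the hint, so this only matters if
   normalisation is wired back in.) */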
5996 
5997 
5998 /* For setting address ranges to a given value.  Has considerable
5999    sophistication so as to avoid generating large numbers of pointless
6000    cache loads/writebacks for large ranges. */
6001 
6002 /* Do small ranges in-cache, in the obvious way. */
6003 static
6004 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
6005 {
6006    /* fast track a couple of common cases */
6007    if (len == 4 && aligned32(a)) {
6008       zsm_swrite32( a, svNew );
6009       return;
6010    }
6011    if (len == 8 && aligned64(a)) {
6012       zsm_swrite64( a, svNew );
6013       return;
6014    }
6015 
6016    /* be completely general (but as efficient as possible) */
6017    if (len == 0) return;
6018 
6019    if (!aligned16(a) && len >= 1) {
6020       zsm_swrite08( a, svNew );
6021       a += 1;
6022       len -= 1;
6023       tl_assert(aligned16(a));
6024    }
6025    if (len == 0) return;
6026 
6027    if (!aligned32(a) && len >= 2) {
6028       zsm_swrite16( a, svNew );
6029       a += 2;
6030       len -= 2;
6031       tl_assert(aligned32(a));
6032    }
6033    if (len == 0) return;
6034 
6035    if (!aligned64(a) && len >= 4) {
6036       zsm_swrite32( a, svNew );
6037       a += 4;
6038       len -= 4;
6039       tl_assert(aligned64(a));
6040    }
6041    if (len == 0) return;
6042 
6043    if (len >= 8) {
6044       tl_assert(aligned64(a));
6045       while (len >= 8) {
6046          zsm_swrite64( a, svNew );
6047          a += 8;
6048          len -= 8;
6049       }
6050       tl_assert(aligned64(a));
6051    }
6052    if (len == 0) return;
6053 
6054    if (len >= 4)
6055       tl_assert(aligned32(a));
6056    if (len >= 4) {
6057       zsm_swrite32( a, svNew );
6058       a += 4;
6059       len -= 4;
6060    }
6061    if (len == 0) return;
6062 
6063    if (len >= 2)
6064       tl_assert(aligned16(a));
6065    if (len >= 2) {
6066       zsm_swrite16( a, svNew );
6067       a += 2;
6068       len -= 2;
6069    }
6070    if (len == 0) return;
6071 
6072    if (len >= 1) {
6073       zsm_swrite08( a, svNew );
6074       //a += 1;
6075       len -= 1;
6076    }
6077    tl_assert(len == 0);
6078 }
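
/* Worked example of the general path above: for a == 0x1003 and
   len == 13 the calls are zsm_swrite08(0x1003), then zsm_swrite32
   (0x1004) -- 0x1004 is already 16- and 32-bit aligned but not 64-bit
   aligned -- and finally zsm_swrite64(0x1008), together covering
   1 + 4 + 8 == 13 bytes. */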
6079 
6080 
6081 /* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
6082    for larger ranges, try to operate directly on the out-of-cache
6083    representation, rather than dragging lines into the cache,
6084    overwriting them, and forcing them out.  This turns out to be an
6085    important performance optimisation.
6086 
6087    Note that this doesn't change the filtering arrangements.  The
6088    caller of zsm_sset_range needs to attend to that. */
6089 
6090 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
6091 {
6092    tl_assert(svNew != SVal_INVALID);
6093    stats__cache_make_New_arange += (ULong)len;
6094 
6095    if (0 && len > 500)
6096       VG_(printf)("make New      ( %#lx, %lu )\n", a, len );
6097 
6098    if (0) {
6099       static UWord n_New_in_cache = 0;
6100       static UWord n_New_not_in_cache = 0;
6101       /* tag is 'a' with the in-line offset masked out,
6102          eg a[31]..a[4] 0000 */
6103       Addr       tag = a & ~(N_LINE_ARANGE - 1);
6104       UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
6105       if (LIKELY(tag == cache_shmem.tags0[wix])) {
6106          n_New_in_cache++;
6107       } else {
6108          n_New_not_in_cache++;
6109       }
6110       if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
6111          VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
6112                      n_New_in_cache, n_New_not_in_cache );
6113    }
6114 
6115    if (LIKELY(len < 2 * N_LINE_ARANGE)) {
6116       zsm_sset_range_SMALL( a, len, svNew );
6117    } else {
6118       Addr  before_start  = a;
6119       Addr  aligned_start = cacheline_ROUNDUP(a);
6120       Addr  after_start   = cacheline_ROUNDDN(a + len);
6121       UWord before_len    = aligned_start - before_start;
6122       UWord aligned_len   = after_start - aligned_start;
6123       UWord after_len     = a + len - after_start;
6124       tl_assert(before_start <= aligned_start);
6125       tl_assert(aligned_start <= after_start);
6126       tl_assert(before_len < N_LINE_ARANGE);
6127       tl_assert(after_len < N_LINE_ARANGE);
6128       tl_assert(get_cacheline_offset(aligned_start) == 0);
6129       if (get_cacheline_offset(a) == 0) {
6130          tl_assert(before_len == 0);
6131          tl_assert(a == aligned_start);
6132       }
6133       if (get_cacheline_offset(a+len) == 0) {
6134          tl_assert(after_len == 0);
6135          tl_assert(after_start == a+len);
6136       }
6137       if (before_len > 0) {
6138          zsm_sset_range_SMALL( before_start, before_len, svNew );
6139       }
6140       if (after_len > 0) {
6141          zsm_sset_range_SMALL( after_start, after_len, svNew );
6142       }
6143       stats__cache_make_New_inZrep += (ULong)aligned_len;
6144 
6145       while (1) {
6146          Addr tag;
6147          UWord wix;
6148          if (aligned_start >= after_start)
6149             break;
6150          tl_assert(get_cacheline_offset(aligned_start) == 0);
6151          tag = aligned_start & ~(N_LINE_ARANGE - 1);
6152          wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
6153          if (tag == cache_shmem.tags0[wix]) {
6154             UWord i;
6155             for (i = 0; i < N_LINE_ARANGE / 8; i++)
6156                zsm_swrite64( aligned_start + i * 8, svNew );
6157          } else {
6158             UWord i;
6159             Word zix;
6160             SecMap* sm;
6161             LineZ* lineZ;
6162             /* This line is not in the cache.  Do not force it in; instead
6163                modify it in-place. */
6164             /* find the Z line to write in and rcdec it or the
6165                associated F line. */
6166             find_Z_for_writing( &sm, &zix, tag );
6167             tl_assert(sm);
6168             tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
6169             lineZ = &sm->linesZ[zix];
6170             lineZ->dict[0] = svNew;
6171             lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
6172             for (i = 0; i < N_LINE_ARANGE/4; i++)
6173                lineZ->ix2s[i] = 0; /* all refer to dict[0] */
6174             rcinc_LineZ(lineZ);
6175          }
6176          aligned_start += N_LINE_ARANGE;
6177          aligned_len -= N_LINE_ARANGE;
6178       }
6179       tl_assert(aligned_start == after_start);
6180       tl_assert(aligned_len == 0);
6181    }
6182 }
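
/* Worked example of the large-range split (assuming the usual 64-byte
   N_LINE_ARANGE): for a == 0x1010 and len == 0x200, aligned_start ==
   0x1040 and after_start == 0x1200, so the 0x30-byte prefix and the
   0x10-byte suffix go through zsm_sset_range_SMALL, while the 0x1C0
   bytes of whole lines in between are written straight into their
   LineZ representations (or via zsm_swrite64 if the line happens to be
   resident in the cache). */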
6183 
6184 
6185 /////////////////////////////////////////////////////////
6186 //                                                     //
6187 // Front-filtering accesses                            //
6188 //                                                     //
6189 /////////////////////////////////////////////////////////
6190 
6191 static UWord stats__f_ac = 0;
6192 static UWord stats__f_sk = 0;
6193 
6194 #if 0
6195 #  define STATS__F_SHOW \
6196      do { \
6197         if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
6198            VG_(printf)("filters: ac %lu sk %lu\n",   \
6199            stats__f_ac, stats__f_sk); \
6200      } while (0)
6201 #else
6202 #  define STATS__F_SHOW /* */
6203 #endif
6204 
6205 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
6206    stats__f_ac++;
6207    STATS__F_SHOW;
6208    if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
6209       stats__f_sk++;
6210       return;
6211    }
6212    zsm_sapply08__msmcwrite(thr, a);
6213 }
6214 
6215 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
6216    stats__f_ac++;
6217    STATS__F_SHOW;
6218    if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
6219       stats__f_sk++;
6220       return;
6221    }
6222    zsm_sapply16__msmcwrite(thr, a);
6223 }
6224 
6225 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
6226    stats__f_ac++;
6227    STATS__F_SHOW;
6228    if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
6229       stats__f_sk++;
6230       return;
6231    }
6232    zsm_sapply32__msmcwrite(thr, a);
6233 }
6234 
6235 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
6236    stats__f_ac++;
6237    STATS__F_SHOW;
6238    if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
6239       stats__f_sk++;
6240       return;
6241    }
6242    zsm_sapply64__msmcwrite(thr, a);
6243 }
6244 
6245 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
6246 {
6247    /* fast track a couple of common cases */
6248    if (len == 4 && aligned32(a)) {
6249       zsm_sapply32_f__msmcwrite( thr, a );
6250       return;
6251    }
6252    if (len == 8 && aligned64(a)) {
6253       zsm_sapply64_f__msmcwrite( thr, a );
6254       return;
6255    }
6256 
6257    /* be completely general (but as efficient as possible) */
6258    if (len == 0) return;
6259 
6260    if (!aligned16(a) && len >= 1) {
6261       zsm_sapply08_f__msmcwrite( thr, a );
6262       a += 1;
6263       len -= 1;
6264       tl_assert(aligned16(a));
6265    }
6266    if (len == 0) return;
6267 
6268    if (!aligned32(a) && len >= 2) {
6269       zsm_sapply16_f__msmcwrite( thr, a );
6270       a += 2;
6271       len -= 2;
6272       tl_assert(aligned32(a));
6273    }
6274    if (len == 0) return;
6275 
6276    if (!aligned64(a) && len >= 4) {
6277       zsm_sapply32_f__msmcwrite( thr, a );
6278       a += 4;
6279       len -= 4;
6280       tl_assert(aligned64(a));
6281    }
6282    if (len == 0) return;
6283 
6284    if (len >= 8) {
6285       tl_assert(aligned64(a));
6286       while (len >= 8) {
6287          zsm_sapply64_f__msmcwrite( thr, a );
6288          a += 8;
6289          len -= 8;
6290       }
6291       tl_assert(aligned64(a));
6292    }
6293    if (len == 0) return;
6294 
6295    if (len >= 4)
6296       tl_assert(aligned32(a));
6297    if (len >= 4) {
6298       zsm_sapply32_f__msmcwrite( thr, a );
6299       a += 4;
6300       len -= 4;
6301    }
6302    if (len == 0) return;
6303 
6304    if (len >= 2)
6305       tl_assert(aligned16(a));
6306    if (len >= 2) {
6307       zsm_sapply16_f__msmcwrite( thr, a );
6308       a += 2;
6309       len -= 2;
6310    }
6311    if (len == 0) return;
6312 
6313    if (len >= 1) {
6314       zsm_sapply08_f__msmcwrite( thr, a );
6315       //a += 1;
6316       len -= 1;
6317    }
6318    tl_assert(len == 0);
6319 }
6320 
6321 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
6322    stats__f_ac++;
6323    STATS__F_SHOW;
6324    if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
6325       stats__f_sk++;
6326       return;
6327    }
6328    zsm_sapply08__msmcread(thr, a);
6329 }
6330 
6331 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
6332    stats__f_ac++;
6333    STATS__F_SHOW;
6334    if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
6335       stats__f_sk++;
6336       return;
6337    }
6338    zsm_sapply16__msmcread(thr, a);
6339 }
6340 
6341 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
6342    stats__f_ac++;
6343    STATS__F_SHOW;
6344    if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
6345       stats__f_sk++;
6346       return;
6347    }
6348    zsm_sapply32__msmcread(thr, a);
6349 }
6350 
6351 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
6352    stats__f_ac++;
6353    STATS__F_SHOW;
6354    if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
6355       stats__f_sk++;
6356       return;
6357    }
6358    zsm_sapply64__msmcread(thr, a);
6359 }
6360 
6361 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
6362 {
6363    /* fast track a couple of common cases */
6364    if (len == 4 && aligned32(a)) {
6365       zsm_sapply32_f__msmcread( thr, a );
6366       return;
6367    }
6368    if (len == 8 && aligned64(a)) {
6369       zsm_sapply64_f__msmcread( thr, a );
6370       return;
6371    }
6372 
6373    /* be completely general (but as efficient as possible) */
6374    if (len == 0) return;
6375 
6376    if (!aligned16(a) && len >= 1) {
6377       zsm_sapply08_f__msmcread( thr, a );
6378       a += 1;
6379       len -= 1;
6380       tl_assert(aligned16(a));
6381    }
6382    if (len == 0) return;
6383 
6384    if (!aligned32(a) && len >= 2) {
6385       zsm_sapply16_f__msmcread( thr, a );
6386       a += 2;
6387       len -= 2;
6388       tl_assert(aligned32(a));
6389    }
6390    if (len == 0) return;
6391 
6392    if (!aligned64(a) && len >= 4) {
6393       zsm_sapply32_f__msmcread( thr, a );
6394       a += 4;
6395       len -= 4;
6396       tl_assert(aligned64(a));
6397    }
6398    if (len == 0) return;
6399 
6400    if (len >= 8) {
6401       tl_assert(aligned64(a));
6402       while (len >= 8) {
6403          zsm_sapply64_f__msmcread( thr, a );
6404          a += 8;
6405          len -= 8;
6406       }
6407       tl_assert(aligned64(a));
6408    }
6409    if (len == 0) return;
6410 
6411    if (len >= 4)
6412       tl_assert(aligned32(a));
6413    if (len >= 4) {
6414       zsm_sapply32_f__msmcread( thr, a );
6415       a += 4;
6416       len -= 4;
6417    }
6418    if (len == 0) return;
6419 
6420    if (len >= 2)
6421       tl_assert(aligned16(a));
6422    if (len >= 2) {
6423       zsm_sapply16_f__msmcread( thr, a );
6424       a += 2;
6425       len -= 2;
6426    }
6427    if (len == 0) return;
6428 
6429    if (len >= 1) {
6430       zsm_sapply08_f__msmcread( thr, a );
6431       //a += 1;
6432       len -= 1;
6433    }
6434    tl_assert(len == 0);
6435 }
6436 
6437 void libhb_Thr_resumes ( Thr* thr )
6438 {
6439    if (0) VG_(printf)("resume %p\n", thr);
6440    tl_assert(thr);
6441    tl_assert(!thr->llexit_done);
6442    Filter__clear(thr->filter, "libhb_Thr_resumes");
6443    /* A kludge, but .. if this thread doesn't have any marker stacks
6444       at all, get one right now.  This is easier than figuring out
6445       exactly when at thread startup we can and can't take a stack
6446       snapshot. */
6447    if (HG_(clo_history_level) == 1) {
6448       tl_assert(thr->local_Kws_n_stacks);
6449       if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
6450          note_local_Kw_n_stack_for(thr);
6451    }
6452 }
6453 
6454 
6455 /////////////////////////////////////////////////////////
6456 //                                                     //
6457 // Synchronisation objects                             //
6458 //                                                     //
6459 /////////////////////////////////////////////////////////
6460 
6461 /* A double linked list of all the SO's. */
6462 SO* admin_SO = NULL;
6463 
6464 static SO* SO__Alloc ( void )
6465 {
6466    SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
6467    so->viR   = VtsID_INVALID;
6468    so->viW   = VtsID_INVALID;
6469    so->magic = SO_MAGIC;
6470    /* Add to double linked list */
6471    if (admin_SO) {
6472       tl_assert(admin_SO->admin_prev == NULL);
6473       admin_SO->admin_prev = so;
6474       so->admin_next = admin_SO;
6475    } else {
6476       so->admin_next = NULL;
6477    }
6478    so->admin_prev = NULL;
6479    admin_SO = so;
6480    /* */
6481    return so;
6482 }
6483 
6484 static void SO__Dealloc ( SO* so )
6485 {
6486    tl_assert(so);
6487    tl_assert(so->magic == SO_MAGIC);
6488    if (so->viR == VtsID_INVALID) {
6489       tl_assert(so->viW == VtsID_INVALID);
6490    } else {
6491       tl_assert(so->viW != VtsID_INVALID);
6492       VtsID__rcdec(so->viR);
6493       VtsID__rcdec(so->viW);
6494    }
6495    so->magic = 0;
6496    /* Del from double linked list */
6497    if (so->admin_prev)
6498       so->admin_prev->admin_next = so->admin_next;
6499    if (so->admin_next)
6500       so->admin_next->admin_prev = so->admin_prev;
6501    if (so == admin_SO)
6502       admin_SO = so->admin_next;
6503    /* */
6504    HG_(free)( so );
6505 }
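
/* The admin list makes it cheap to walk every live SO, e.g. for ad-hoc
   debugging.  A minimal sketch (disabled; illustration only):
*/
#if 0
static UWord SO__count_all ( void )
{
   UWord n = 0;
   /* admin_SO is the head of the doubly linked list built above */
   for (SO* so = admin_SO; so; so = so->admin_next)
      n++;
   return n;
}
#endif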
6506 
6507 
6508 /////////////////////////////////////////////////////////
6509 //                                                     //
6510 // Top Level API                                       //
6511 //                                                     //
6512 /////////////////////////////////////////////////////////
6513 
6514 static void show_thread_state ( const HChar* str, Thr* t )
6515 {
6516    if (1) return;
6517    if (t->viR == t->viW) {
6518       VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
6519       VtsID__pp( t->viR );
6520       VG_(printf)("%s","\n");
6521    } else {
6522       VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
6523       VtsID__pp( t->viR );
6524       VG_(printf)(" viW %u==", t->viW);
6525       VtsID__pp( t->viW );
6526       VG_(printf)("%s","\n");
6527    }
6528 }
6529 
6530 
6531 Thr* libhb_init (
6532         void        (*get_stacktrace)( Thr*, Addr*, UWord ),
6533         ExeContext* (*get_EC)( Thr* )
6534      )
6535 {
6536    Thr*  thr;
6537    VtsID vi;
6538 
6539    // We will have to store a large number of these,
6540    // so make sure they're the size we expect them to be.
6541    STATIC_ASSERT(sizeof(ScalarTS) == 8);
6542 
6543    /* because first 1024 unusable */
6544    STATIC_ASSERT(SCALARTS_N_THRBITS >= 11);
6545    /* so as to fit in a UInt w/ 5 bits to spare (see defn of
6546       Thr_n_RCEC and TSW). */
6547    STATIC_ASSERT(SCALARTS_N_THRBITS <= 27);
6548 
6549    /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
6550       (32-bit).  It's not correctness-critical, but there are a lot of
6551       them, so it's important from a space viewpoint.  Unfortunately
6552       we simply can't pack it into 2 words on a 32-bit target. */
6553    STATIC_ASSERT(   (sizeof(UWord) == 8 && sizeof(Thr_n_RCEC) == 16)
6554                  || (sizeof(UWord) == 4 && sizeof(Thr_n_RCEC) == 12));
6555    STATIC_ASSERT(sizeof(TSW) == sizeof(UInt));
6556 
6557    /* Word sets really are 32 bits.  Even on a 64 bit target. */
6558    STATIC_ASSERT(sizeof(WordSetID) == 4);
6559    STATIC_ASSERT(sizeof(WordSet) == sizeof(WordSetID));
6560 
6561    tl_assert(get_stacktrace);
6562    tl_assert(get_EC);
6563    main_get_stacktrace   = get_stacktrace;
6564    main_get_EC           = get_EC;
6565 
6566    // No need to initialise hg_wordfm.
6567    // No need to initialise hg_wordset.
6568 
6569    /* Allocated once and never deallocated.  Used as a temporary in
6570       VTS singleton, tick and join operations. */
6571    temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
6572    temp_max_sized_VTS->id = VtsID_INVALID;
6573    verydead_thread_tables_init();
6574    vts_set_init();
6575    vts_tab_init();
6576    event_map_init();
6577    VtsID__invalidate_caches();
6578 
6579    // initialise shadow memory
6580    zsm_init( );
6581 
6582    thr = Thr__new();
6583    vi  = VtsID__mk_Singleton( thr, 1 );
6584    thr->viR = vi;
6585    thr->viW = vi;
6586    VtsID__rcinc(thr->viR);
6587    VtsID__rcinc(thr->viW);
6588 
6589    show_thread_state("  root", thr);
6590    return thr;
6591 }
6592 
6593 
6594 Thr* libhb_create ( Thr* parent )
6595 {
6596    /* The child's VTSs are copies of the parent's VTSs, but ticked at
6597       the child's index.  Since the child's index is guaranteed
6598       unique, it has never been seen before, so the implicit value
6599       before the tick is zero and after that is one. */
6600    Thr* child = Thr__new();
6601 
6602    child->viR = VtsID__tick( parent->viR, child );
6603    child->viW = VtsID__tick( parent->viW, child );
6604    Filter__clear(child->filter, "libhb_create(child)");
6605    VtsID__rcinc(child->viR);
6606    VtsID__rcinc(child->viW);
6607    /* We need to do note_local_Kw_n_stack_for( child ), but it's too
6608       early for that - it may not have a valid TId yet.  So, let
6609       libhb_Thr_resumes pick it up the first time the thread runs. */
6610 
6611    tl_assert(VtsID__indexAt( child->viR, child ) == 1);
6612    tl_assert(VtsID__indexAt( child->viW, child ) == 1);
6613 
6614    /* and the parent has to move along too */
6615    VtsID__rcdec(parent->viR);
6616    VtsID__rcdec(parent->viW);
6617    parent->viR = VtsID__tick( parent->viR, parent );
6618    parent->viW = VtsID__tick( parent->viW, parent );
6619    Filter__clear(parent->filter, "libhb_create(parent)");
6620    VtsID__rcinc(parent->viR);
6621    VtsID__rcinc(parent->viW);
6622    note_local_Kw_n_stack_for( parent );
6623 
6624    show_thread_state(" child", child);
6625    show_thread_state("parent", parent);
6626 
6627    return child;
6628 }
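
/* A concrete illustration (clocks shown per-thread): if the parent's
   VTS is { P:3 } when libhb_create runs, the child starts with
   { P:3, C:1 } for both viR and viW, and the parent moves on to
   { P:4 }.  Anything the child subsequently does is therefore ordered
   after everything the parent did up to the create point -- the kind
   of happens-before edge a thread-create operation requires. */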
6629 
6630 /* Shut down the library, and print stats (in fact that's _all_
6631    this is for). */
6632 void libhb_shutdown ( Bool show_stats )
6633 {
6634    if (show_stats) {
6635       VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
6636       VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
6637                   stats__secmaps_allocd,
6638                   stats__secmap_ga_space_covered);
6639       VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
6640                   stats__secmap_linesZ_allocd,
6641                   stats__secmap_linesZ_bytes);
6642       VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)"
6643                   " (%'10lu used)\n",
6644                   VG_(sizePA) (LineF_pool_allocator),
6645                   VG_(sizePA) (LineF_pool_allocator) * sizeof(LineF),
6646                   shmem__SecMap_used_linesF());
6647       VG_(printf)(" secmaps: %'10lu in map (can be scanGCed %'5lu)"
6648                   " #%lu scanGC \n",
6649                   stats__secmaps_in_map_shmem,
6650                   shmem__SecMap_do_GC(False /* really do GC */),
6651                   stats__secmaps_scanGC);
6652       tl_assert (VG_(sizeFM) (map_shmem) == stats__secmaps_in_map_shmem);
6653       VG_(printf)(" secmaps: %'10lu in freelist,"
6654                   " total (scanGCed %'lu, ssetGCed %'lu)\n",
6655                   SecMap_freelist_length(),
6656                   stats__secmaps_scanGCed,
6657                   stats__secmaps_ssetGCed);
6658       VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
6659                   stats__secmaps_search, stats__secmaps_search_slow);
6660 
6661       VG_(printf)("%s","\n");
6662       VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
6663                   stats__cache_totrefs, stats__cache_totmisses );
6664       VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
6665                   stats__cache_Z_fetches, stats__cache_F_fetches );
6666       VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
6667                   stats__cache_Z_wbacks, stats__cache_F_wbacks );
6668       VG_(printf)("   cache: %'14lu flushes_invals\n",
6669                   stats__cache_flushes_invals );
6670       VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
6671                   stats__cache_make_New_arange,
6672                   stats__cache_make_New_inZrep);
6673 
6674       VG_(printf)("%s","\n");
6675       VG_(printf)("   cline: %'10lu normalises\n",
6676                   stats__cline_normalises );
6677       VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6678                   stats__cline_cread64s,
6679                   stats__cline_cread32s,
6680                   stats__cline_cread16s,
6681                   stats__cline_cread08s );
6682       VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6683                   stats__cline_cwrite64s,
6684                   stats__cline_cwrite32s,
6685                   stats__cline_cwrite16s,
6686                   stats__cline_cwrite08s );
6687       VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
6688                   stats__cline_swrite64s,
6689                   stats__cline_swrite32s,
6690                   stats__cline_swrite16s,
6691                   stats__cline_swrite08s );
6692       VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
6693                   stats__cline_sread08s, stats__cline_scopy08s );
6694       VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu"
6695                   "    2to1 %'12lu\n",
6696                   stats__cline_64to32splits, stats__cline_32to16splits,
6697                   stats__cline_16to8splits );
6698       VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu"
6699                   "    2to1 %'12lu\n",
6700                   stats__cline_64to32pulldown, stats__cline_32to16pulldown,
6701                   stats__cline_16to8pulldown );
6702       if (0)
6703       VG_(printf)("   cline: sizeof(CacheLineZ) %ld,"
6704                   " covers %ld bytes of arange\n",
6705                   (Word)sizeof(LineZ),
6706                   (Word)N_LINE_ARANGE);
6707 
6708       VG_(printf)("%s","\n");
6709 
6710       VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
6711                   stats__msmcread, stats__msmcread_change);
6712       VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
6713                   stats__msmcwrite, stats__msmcwrite_change);
6714       VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
6715                   stats__cmpLEQ_queries, stats__cmpLEQ_misses);
6716       VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
6717                   stats__join2_queries, stats__join2_misses);
6718 
6719       VG_(printf)("%s","\n");
6720       VG_(printf)("   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
6721                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
6722       VG_(printf)("   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
6723                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow);
6724       VG_(printf)("   libhb: VTSset: find__or__clone_and_add %'lu"
6725                   " (%'lu allocd)\n",
6726                    stats__vts_set__focaa, stats__vts_set__focaa_a );
6727       VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
6728                    stats__vts__indexat_slow );
6729 
6730       VG_(printf)("%s","\n");
6731       VG_(printf)(
6732          "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
6733          VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
6734       );
6735       VG_(printf)("   libhb: #%lu vts_tab GC    #%lu vts pruning\n",
6736                   stats__vts_tab_GC, stats__vts_pruning);
6737       VG_(printf)( "   libhb: %lu entries in vts_set\n",
6738                    VG_(sizeFM)( vts_set ) );
6739 
6740       VG_(printf)("%s","\n");
6741       {
6742          UInt live = 0;
6743          UInt llexit_done = 0;
6744          UInt joinedwith_done = 0;
6745          UInt llexit_and_joinedwith_done = 0;
6746 
6747          Thread* hgthread = get_admin_threads();
6748          tl_assert(hgthread);
6749          while (hgthread) {
6750             Thr* hbthr = hgthread->hbthr;
6751             tl_assert(hbthr);
6752             if (hbthr->llexit_done && hbthr->joinedwith_done)
6753                llexit_and_joinedwith_done++;
6754             else if (hbthr->llexit_done)
6755                llexit_done++;
6756             else if (hbthr->joinedwith_done)
6757                joinedwith_done++;
6758             else
6759                live++;
6760             hgthread = hgthread->admin;
6761          }
6762          VG_(printf)("   libhb: threads live: %u exit_and_joinedwith %u"
6763                      " exit %u joinedwith %u\n",
6764                      live, llexit_and_joinedwith_done,
6765                      llexit_done, joinedwith_done);
6766          VG_(printf)("   libhb: %d verydead_threads, "
6767                      "%d verydead_threads_not_pruned\n",
6768                      (int) VG_(sizeXA)( verydead_thread_table),
6769                      (int) VG_(sizeXA)( verydead_thread_table_not_pruned));
6770          tl_assert (VG_(sizeXA)( verydead_thread_table)
6771                     + VG_(sizeXA)( verydead_thread_table_not_pruned)
6772                     == llexit_and_joinedwith_done);
6773       }
6774 
6775       VG_(printf)("%s","\n");
6776       VG_(printf)( "   libhb: oldrefHTN %lu (%'d bytes)\n",
6777                    oldrefHTN, (int)(oldrefHTN * sizeof(OldRef)));
6778       tl_assert (oldrefHTN == VG_(HT_count_nodes) (oldrefHT));
6779       VG_(printf)( "   libhb: oldref lookup found=%lu notfound=%lu\n",
6780                    stats__evm__lookup_found, stats__evm__lookup_notfound);
6781       if (VG_(clo_verbosity) > 1)
6782          VG_(HT_print_stats) (oldrefHT, cmp_oldref_tsw);
6783       VG_(printf)( "   libhb: oldref bind tsw/rcec "
6784                    "==/==:%'lu ==/!=:%'lu !=/!=:%'lu\n",
6785                    stats__ctxt_eq_tsw_eq_rcec, stats__ctxt_eq_tsw_neq_rcec,
6786                    stats__ctxt_neq_tsw_neq_rcec);
6787       VG_(printf)( "   libhb: ctxt__rcdec calls %'lu. rcec gc discards %'lu\n",
6788                    stats__ctxt_rcdec_calls, stats__ctxt_rcec_gc_discards);
6789       VG_(printf)( "   libhb: contextTab: %lu slots,"
6790                    " %lu cur ents(ref'd %lu),"
6791                    " %lu max ents\n",
6792                    (UWord)N_RCEC_TAB,
6793                    stats__ctxt_tab_curr, RCEC_referenced,
6794                    stats__ctxt_tab_max );
6795       VG_(printf) ("   libhb: stats__cached_rcec "
6796                    "identical %'lu updated %'lu fresh %'lu\n",
6797                    stats__cached_rcec_identical, stats__cached_rcec_updated,
6798                    stats__cached_rcec_fresh);
6799       if (stats__cached_rcec_diff > 0)
6800          VG_(printf) ("   libhb: stats__cached_rcec diff unk reason%'lu\n",
6801                       stats__cached_rcec_diff);
6802       if (stats__cached_rcec_diff_known_reason > 0)
6803          VG_(printf) ("   libhb: stats__cached_rcec diff known reason %'lu\n",
6804                       stats__cached_rcec_diff_known_reason);
6805 
6806       {
6807 #        define  MAXCHAIN 10
6808          UInt chains[MAXCHAIN+1]; // [MAXCHAIN] gets all chains >= MAXCHAIN
6809          UInt non0chain = 0;
6810          UInt n;
6811          UInt i;
6812          RCEC *p;
6813 
6814          for (i = 0; i <= MAXCHAIN; i++) chains[i] = 0;
6815          for (i = 0; i < N_RCEC_TAB; i++) {
6816             n = 0;
6817             for (p = contextTab[i]; p; p = p->next)
6818                n++;
6819             if (n < MAXCHAIN)
6820                chains[n]++;
6821             else
6822                chains[MAXCHAIN]++;
6823             if (n > 0)
6824                non0chain++;
6825          }
6826          VG_(printf)( "   libhb: contextTab chain of [length]=nchain."
6827                       " Avg chain len %3.1f\n"
6828                       "        ",
6829                       (Double)stats__ctxt_tab_curr
6830                       / (Double)(non0chain ? non0chain : 1));
6831          for (i = 0; i <= MAXCHAIN; i++) {
6832             if (chains[i] != 0)
6833                 VG_(printf)( "[%u%s]=%u ",
6834                              i, i == MAXCHAIN ? "+" : "",
6835                              chains[i]);
6836          }
6837          VG_(printf)( "\n");
6838 #        undef MAXCHAIN
6839       }
6840       VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
6841                    stats__ctxt_tab_qs,
6842                    stats__ctxt_tab_cmps );
6843 #if 0
6844       VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
6845       VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
6846       VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
6847       VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
6848       VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
6849       VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
6850       VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
6851       VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
6852       VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
6853       VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
6854       VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
6855       VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
6856       VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
6857       VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
6858 
6859       VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
6860       VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
6861       VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
6862       VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
6863 #endif
6864 
6865       VG_(printf)("%s","<<< END libhb stats >>>\n");
6866       VG_(printf)("%s","\n");
6867 
6868    }
6869 }
6870 
6871 /* Receive notification that a thread has low level exited.  The
6872    significance here is that we do not expect to see any more memory
6873    references from it. */
6874 void libhb_async_exit ( Thr* thr )
6875 {
6876    tl_assert(thr);
6877    tl_assert(!thr->llexit_done);
6878    thr->llexit_done = True;
6879 
6880    /* Check nobody messed up with the cached_rcec */
6881    tl_assert (thr->cached_rcec.magic == RCEC_MAGIC);
6882    tl_assert (thr->cached_rcec.rc == 0);
6883    tl_assert (thr->cached_rcec.rcX == 0);
6884    tl_assert (thr->cached_rcec.next == NULL);
6885 
6886    /* Just to be sure, declare the cached stack invalid. */
6887    set_cached_rcec_validity(thr, False);
6888 
6889    /* free up Filter and local_Kws_n_stacks (well, actually not the
6890       latter ..) */
6891    tl_assert(thr->filter);
6892    HG_(free)(thr->filter);
6893    thr->filter = NULL;
6894 
6895    /* Tell the VTS mechanism this thread has exited, so it can
6896       participate in VTS pruning.  Note this can only happen if the
6897       thread has both ll_exited and has been joined with. */
6898    if (thr->joinedwith_done)
6899       VTS__declare_thread_very_dead(thr);
6900 
6901    /* Another space-accuracy tradeoff.  Do we want to be able to show
6902       H1 history for conflicts in threads which have since exited?  If
6903       yes, then we better not free up thr->local_Kws_n_stacks.  The
6904       downside is a potential per-thread leak of up to
6905       N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
6906       XArray average overcommit factor is (1.5 I'd guess). */
6907    // hence:
6908    // VG_(deleteXA)(thr->local_Kws_n_stacks);
6909    // thr->local_Kws_n_stacks = NULL;
6910 }
6911 
6912 /* Receive notification that a thread has been joined with.  The
6913    significance here is that we do not expect to see any further
6914    references to its vector clocks (Thr::viR and Thr::viW). */
6915 void libhb_joinedwith_done ( Thr* thr )
6916 {
6917    tl_assert(thr);
6918    /* Caller must ensure that this is only ever called once per Thr. */
6919    tl_assert(!thr->joinedwith_done);
6920    thr->joinedwith_done = True;
6921    if (thr->llexit_done)
6922       VTS__declare_thread_very_dead(thr);
6923 }
6924 
6925 
6926 /* Both Segs and SOs point to VTSs.  However, there is no sharing, so
6927    a Seg that points at a VTS is its one-and-only owner, and ditto for
6928    a SO that points at a VTS. */
6929 
6930 SO* libhb_so_alloc ( void )
6931 {
6932    return SO__Alloc();
6933 }
6934 
6935 void libhb_so_dealloc ( SO* so )
6936 {
6937    tl_assert(so);
6938    tl_assert(so->magic == SO_MAGIC);
6939    SO__Dealloc(so);
6940 }
6941 
6942 /* See comments in libhb.h for details on the meaning of
6943    strong vs weak sends and strong vs weak receives. */
6944 void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
6945 {
6946    /* Copy the VTSs from 'thr' into the sync object, and then move
6947       the thread along one step. */
6948 
6949    tl_assert(so);
6950    tl_assert(so->magic == SO_MAGIC);
6951 
6952    /* stay sane .. a thread's read-clock must always lead or be the
6953       same as its write-clock */
6954    { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
6955      tl_assert(leq);
6956    }
6957 
6958    /* since we're overwriting the VtsIDs in the SO, we need to drop
6959       any references made by the previous contents thereof */
6960    if (so->viR == VtsID_INVALID) {
6961       tl_assert(so->viW == VtsID_INVALID);
6962       so->viR = thr->viR;
6963       so->viW = thr->viW;
6964       VtsID__rcinc(so->viR);
6965       VtsID__rcinc(so->viW);
6966    } else {
6967       /* In a strong send, we dump any previous VC in the SO and
6968          install the sending thread's VC instead.  For a weak send we
6969          must join2 with what's already there. */
6970       tl_assert(so->viW != VtsID_INVALID);
6971       VtsID__rcdec(so->viR);
6972       VtsID__rcdec(so->viW);
6973       so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
6974       so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
6975       VtsID__rcinc(so->viR);
6976       VtsID__rcinc(so->viW);
6977    }
6978 
6979    /* move both parent clocks along */
6980    VtsID__rcdec(thr->viR);
6981    VtsID__rcdec(thr->viW);
6982    thr->viR = VtsID__tick( thr->viR, thr );
6983    thr->viW = VtsID__tick( thr->viW, thr );
6984    if (!thr->llexit_done) {
6985       Filter__clear(thr->filter, "libhb_so_send");
6986       note_local_Kw_n_stack_for(thr);
6987    }
6988    VtsID__rcinc(thr->viR);
6989    VtsID__rcinc(thr->viW);
6990 
6991    if (strong_send)
6992       show_thread_state("s-send", thr);
6993    else
6994       show_thread_state("w-send", thr);
6995 }
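
/* Numeric sketch (per-thread clocks written as scalars): suppose the
   sender's (viR,viW) is ({T:5},{T:5}) and the SO already holds
   ({S:2},{S:2}) from an earlier sender S.  A strong send overwrites
   the SO with ({T:5},{T:5}); a weak send joins, leaving
   ({S:2,T:5},{S:2,T:5}).  Either way the sender then ticks to
   ({T:6},{T:6}), so its post-send accesses are not covered by this
   send. */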
6996 
6997 void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
6998 {
6999    tl_assert(so);
7000    tl_assert(so->magic == SO_MAGIC);
7001 
7002    if (so->viR != VtsID_INVALID) {
7003       tl_assert(so->viW != VtsID_INVALID);
7004 
7005       /* Weak receive (basically, an R-acquisition of a R-W lock).
7006          This advances the read-clock of the receiver, but not the
7007          write-clock. */
7008       VtsID__rcdec(thr->viR);
7009       thr->viR = VtsID__join2( thr->viR, so->viR );
7010       VtsID__rcinc(thr->viR);
7011 
7012       /* At one point (r10589) it seemed safest to tick the clocks for
7013          the receiving thread after the join.  But on reflection, I
7014          wonder if that might cause it to 'overtake' constraints,
7015          which could lead to missing races.  So, back out that part of
7016          r10589. */
7017       //VtsID__rcdec(thr->viR);
7018       //thr->viR = VtsID__tick( thr->viR, thr );
7019       //VtsID__rcinc(thr->viR);
7020 
7021       /* For a strong receive, we also advance the receiver's write
7022          clock, which means the receive as a whole is essentially
7023          equivalent to a W-acquisition of a R-W lock. */
7024       if (strong_recv) {
7025          VtsID__rcdec(thr->viW);
7026          thr->viW = VtsID__join2( thr->viW, so->viW );
7027          VtsID__rcinc(thr->viW);
7028 
7029          /* See comment just above, re r10589. */
7030          //VtsID__rcdec(thr->viW);
7031          //thr->viW = VtsID__tick( thr->viW, thr );
7032          //VtsID__rcinc(thr->viW);
7033       }
7034 
7035       if (thr->filter)
7036          Filter__clear(thr->filter, "libhb_so_recv");
7037       note_local_Kw_n_stack_for(thr);
7038 
7039       if (strong_recv)
7040          show_thread_state("s-recv", thr);
7041       else
7042          show_thread_state("w-recv", thr);
7043 
7044    } else {
7045       tl_assert(so->viW == VtsID_INVALID);
7046       /* Deal with degenerate case: 'so' has no vts, so there has been
7047          no message posted to it.  Just ignore this case. */
7048       show_thread_state("d-recv", thr);
7049    }
7050 }
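
/* Continuing the sketch above: if a receiver R with clocks
   ({R:9},{R:9}) does a strong recv on an SO holding ({T:5},{T:5}), it
   becomes ({R:9,T:5},{R:9,T:5}), i.e. everything T did before the
   matching send now happens-before R's subsequent accesses.  A weak
   recv would update only viR -- the R-acquisition case described
   above. */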
7051 
7052 Bool libhb_so_everSent ( SO* so )
7053 {
7054    if (so->viR == VtsID_INVALID) {
7055       tl_assert(so->viW == VtsID_INVALID);
7056       return False;
7057    } else {
7058       tl_assert(so->viW != VtsID_INVALID);
7059       return True;
7060    }
7061 }
7062 
7063 #define XXX1 0 // 0x67a106c
7064 #define XXX2 0
7065 
7066 static inline Bool TRACEME(Addr a, SizeT szB) {
7067    if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
7068    if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
7069    return False;
7070 }
7071 static void trace ( Thr* thr, Addr a, SizeT szB, const HChar* s )
7072 {
7073   SVal sv = zsm_sread08(a);
7074   VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
7075   show_thread_state("", thr);
7076   VG_(printf)("%s","\n");
7077 }
7078 
7079 void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
7080 {
7081    SVal sv = SVal__mkC(thr->viW, thr->viW);
7082    tl_assert(is_sane_SVal_C(sv));
7083    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
7084    zsm_sset_range( a, szB, sv );
7085    Filter__clear_range( thr->filter, a, szB );
7086    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
7087 }
7088 
libhb_srange_noaccess_NoFX(Thr * thr,Addr a,SizeT szB)7089 void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
7090 {
7091    /* do nothing */
7092 }


/* Set the lines zix_start till zix_end to NOACCESS. */
static void zsm_secmap_line_range_noaccess (SecMap *sm,
                                            UInt zix_start, UInt zix_end)
{
   for (UInt lz = zix_start; lz <= zix_end; lz++) {
      LineZ* lineZ;
      lineZ = &sm->linesZ[lz];
      if (lineZ->dict[0] != SVal_INVALID) {
         rcdec_LineZ(lineZ);
         lineZ->dict[0] = SVal_NOACCESS;
         lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
      } else {
         clear_LineF_of_Z(lineZ);
      }
      for (UInt i = 0; i < N_LINE_ARANGE/4; i++)
         lineZ->ix2s[i] = 0; /* all refer to dict[0] */
   }
}

/* Set the given range to SVal_NOACCESS in-place in the secmap.
   a must be cacheline-aligned.  len must be a multiple of the cacheline
   size and must be < N_SECMAP_ARANGE. */
static void zsm_sset_range_noaccess_in_secmap(Addr a, SizeT len)
{
   tl_assert (is_valid_scache_tag (a));
   tl_assert (0 == (len & (N_LINE_ARANGE - 1)));
   tl_assert (len < N_SECMAP_ARANGE);

   SecMap *sm1 = shmem__find_SecMap (a);
   SecMap *sm2 = shmem__find_SecMap (a + len - 1);
   UWord zix_start = shmem__get_SecMap_offset(a          ) >> N_LINE_BITS;
   UWord zix_end   = shmem__get_SecMap_offset(a + len - 1) >> N_LINE_BITS;

   if (sm1) {
      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm1));
      zsm_secmap_line_range_noaccess (sm1, zix_start,
                                      sm1 == sm2 ? zix_end : N_SECMAP_ZLINES-1);
   }
   if (sm2 && sm1 != sm2) {
      if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm2));
      zsm_secmap_line_range_noaccess (sm2, 0, zix_end);
   }
}

/* Set the given address range to SVal_NOACCESS.
   SecMaps that end up entirely SVal_NOACCESS are pushed onto the
   SecMap_freelist. */
static void zsm_sset_range_noaccess (Addr addr, SizeT len)
{
   /*
       BPC = Before, Partial Cacheline, = addr
             (i.e. starting inside a cacheline/inside a SecMap)
       BFC = Before, Full Cacheline(s), but not full SecMap
             (i.e. starting inside a SecMap)
       FSM = Full SecMap(s)
             (i.e. starting a SecMap)
       AFC = After, Full Cacheline(s), but not full SecMap
             (i.e. first address after the full SecMap(s))
       APC = After, Partial Cacheline
             (i.e. first address after the full CacheLine(s))
       ARE = After Range End = addr+len = first address not part of the range.

       If addr     starts a Cacheline, then BPC == BFC.
       If addr     starts a SecMap,    then BPC == BFC == FSM.
       If addr+len starts a SecMap,    then APC == ARE == AFC.
       If addr+len starts a Cacheline, then APC == ARE.
   */
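   /* Hedged worked example (values purely illustrative): suppose
      N_LINE_ARANGE == 0x40 and N_SECMAP_ARANGE == 0x2000 (the real values
      are defined earlier in this file), and addr == 0x1050, len == 0x5000.
      Then:
         ARE == 0x6050
         BPC == 0x1050, BFC == 0x1080, FSM == 0x2000,
         AFC == 0x6000, APC == 0x6040
      and the PlenCONSUME calls below split the 0x5000 bytes as
         BPClen == 0x30, BFClen == 0xF80, FSMlen == 0x4000,
         AFClen == 0x40, APClen == 0x10
      which sum to 0x5000, so the final assertion Plen == 0 holds. */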
   Addr ARE = addr + len;
   Addr BPC = addr;
   Addr BFC = ROUNDUP(BPC, N_LINE_ARANGE);
   Addr FSM = ROUNDUP(BPC, N_SECMAP_ARANGE);
   Addr AFC = ROUNDDN(ARE, N_SECMAP_ARANGE);
   Addr APC = ROUNDDN(ARE, N_LINE_ARANGE);
   SizeT Plen = len; // Plen will be split between the following:
   SizeT BPClen;
   SizeT BFClen;
   SizeT FSMlen;
   SizeT AFClen;
   SizeT APClen;

   /* Consume from Plen the number of bytes between from and to.
      from and to must be aligned to a multiple of round.
      The amount consumed will be a multiple of round, capped at Plen. */
#  define PlenCONSUME(from, to, round, consumed) \
   do {                                          \
   if (from < to) {                              \
      if (to - from < Plen)                      \
         consumed = to - from;                   \
      else                                       \
         consumed = ROUNDDN(Plen, round);        \
   } else {                                      \
      consumed = 0;                              \
   }                                             \
   Plen -= consumed; } while (0)

   PlenCONSUME(BPC, BFC, 1,               BPClen);
   PlenCONSUME(BFC, FSM, N_LINE_ARANGE,   BFClen);
   PlenCONSUME(FSM, AFC, N_SECMAP_ARANGE, FSMlen);
   PlenCONSUME(AFC, APC, N_LINE_ARANGE,   AFClen);
   PlenCONSUME(APC, ARE, 1,               APClen);

   if (0)
      VG_(printf) ("addr %p[%lu] ARE %p"
                   " BPC %p[%lu] BFC %p[%lu] FSM %p[%lu]"
                   " AFC %p[%lu] APC %p[%lu]\n",
                   (void*)addr, len, (void*)ARE,
                   (void*)BPC, BPClen, (void*)BFC, BFClen, (void*)FSM, FSMlen,
                   (void*)AFC, AFClen, (void*)APC, APClen);

   tl_assert (Plen == 0);

   /* Set to NOACCESS the pieces before and after the range that are not
      covered by entire SecMaps. */

   /* First we set the partial cachelines.  This is done through the cache. */
   if (BPClen > 0)
      zsm_sset_range_SMALL (BPC, BPClen, SVal_NOACCESS);
   if (APClen > 0)
      zsm_sset_range_SMALL (APC, APClen, SVal_NOACCESS);

   /* After this, we will not use the cache any more; we will work
      directly in-place on the z shadow memory in the SecMap(s).
      So, invalidate the cachelines for the whole range we are setting
      to NOACCESS below. */
   shmem__invalidate_scache_range (BFC, APC - BFC);

   if (BFClen > 0)
      zsm_sset_range_noaccess_in_secmap (BFC, BFClen);
   if (AFClen > 0)
      zsm_sset_range_noaccess_in_secmap (AFC, AFClen);

   if (FSMlen > 0) {
      /* Set to NOACCESS all the SecMaps, pushing the SecMaps to the
         free list. */
      Addr  sm_start = FSM;
      while (sm_start < AFC) {
         SecMap *sm = shmem__find_SecMap (sm_start);
         if (sm) {
            Addr gaKey;
            SecMap *fm_sm;

            if (CHECK_ZSM) tl_assert(is_sane_SecMap(sm));
            for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
               LineZ *lineZ = &sm->linesZ[lz];
               if (LIKELY(lineZ->dict[0] != SVal_INVALID))
                  rcdec_LineZ(lineZ);
               else
                  clear_LineF_of_Z(lineZ);
            }
            if (!VG_(delFromFM)(map_shmem, &gaKey, (UWord*)&fm_sm, sm_start))
               tl_assert (0);
            stats__secmaps_in_map_shmem--;
            tl_assert (gaKey == sm_start);
            tl_assert (sm == fm_sm);
            stats__secmaps_ssetGCed++;
            push_SecMap_on_freelist (sm);
         }
         sm_start += N_SECMAP_ARANGE;
      }
      tl_assert (sm_start == AFC);

      /* The above loop may have left copies of freed SecMaps in the
         smCache, so clear them out. */
      if (address_in_range(smCache[0].gaKey, FSM, FSMlen)) {
         smCache[0].gaKey = 1;
         smCache[0].sm = NULL;
      }
      if (address_in_range(smCache[1].gaKey, FSM, FSMlen)) {
         smCache[1].gaKey = 1;
         smCache[1].sm = NULL;
      }
      if (address_in_range(smCache[2].gaKey, FSM, FSMlen)) {
         smCache[2].gaKey = 1;
         smCache[2].sm = NULL;
      }
      STATIC_ASSERT (3 == sizeof(smCache)/sizeof(SMCacheEnt));
   }
}

void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
{
   /* This really does put the requested range in NoAccess.  It's
      expensive though. */
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (LIKELY(szB < 2 * N_LINE_ARANGE))
      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
   else
      zsm_sset_range_noaccess (a, szB);
   Filter__clear_range( thr->filter, a, szB );
}

/* Works byte at a time.  Can be optimised if needed. */
UWord libhb_srange_get_abits (Addr a, UChar *abits, SizeT len)
{
   UWord anr = 0; // nr of bytes addressable.

   /* Get the accessibility of each byte, taking care not to create a
      SecMap or LineZ while checking whether a byte is addressable.

      Note: this is used for a client request, so performance is not
      deemed critical.  For simplicity we therefore work byte by byte;
      it could be sped up by handling full cachelines or full SecMaps
      when a cacheline or SecMap boundary is reached. */
   for (SizeT i = 0; i < len; i++) {
      SVal       sv = SVal_INVALID;
      Addr       b = a + i;
      Addr       tag = b & ~(N_LINE_ARANGE - 1);
      UWord      wix = (b >> N_LINE_BITS) & (N_WAY_NENT - 1);
      UWord      cloff = get_cacheline_offset(b);

      /* Note: we do not use get_cacheline(b), so as to avoid creating
         cachelines and/or SecMaps for non-addressable bytes. */
      if (tag == cache_shmem.tags0[wix]) {
         CacheLine copy = cache_shmem.lyns0[wix];
         /* We work on a copy of the cacheline, as we do not want to
            record the client request as a real read.
            The below is somewhat similar to zsm_sapply08__msmcread but
            avoids side effects on the cache. */
         UWord toff = get_tree_offset(b); /* == 0 .. 7 */
         UWord tno  = get_treeno(b);
         UShort descr = copy.descrs[tno];
         if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
            SVal* tree = &copy.svals[tno << 3];
            copy.descrs[tno] = pulldown_to_8(tree, toff, descr);
         }
         sv = copy.svals[cloff];
      } else {
         /* Byte not found in the cacheline.  Search for a SecMap. */
         SecMap *sm = shmem__find_SecMap(b);
         LineZ *lineZ;
         if (sm == NULL)
            sv = SVal_NOACCESS;
         else {
            UWord zix = shmem__get_SecMap_offset(b) >> N_LINE_BITS;
            lineZ = &sm->linesZ[zix];
            if (lineZ->dict[0] == SVal_INVALID) {
               LineF *lineF = SVal2Ptr(lineZ->dict[1]);
               sv = lineF->w64s[cloff];
            } else {
               UWord ix = read_twobit_array( lineZ->ix2s, cloff );
               sv = lineZ->dict[ix];
            }
         }
      }

      tl_assert (sv != SVal_INVALID);
      if (sv == SVal_NOACCESS) {
         if (abits)
            abits[i] = 0x00;
      } else {
         if (abits)
            abits[i] = 0xff;
         anr++;
      }
   }

   return anr;
}
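
/* Illustrative sketch only (compiled out): one way a caller -- e.g. a
   client-request handler in the tool proper -- might use
   libhb_srange_get_abits to query the addressability of a buffer.  The
   helper name and allocation cost-centre string are hypothetical. */
#if 0
static void example_query_abits ( Addr start, SizeT len )
{
   UChar* abits = HG_(zalloc)( "libhb.example_query_abits.1", len );
   UWord  anr   = libhb_srange_get_abits( start, abits, len );
   VG_(printf)( "%lu of %lu bytes addressable\n", anr, (UWord)len );
   HG_(free)( abits );
}
#endif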


void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
   if (LIKELY(szB < 2 * N_LINE_ARANGE))
      zsm_sset_range_SMALL (a, szB, SVal_NOACCESS);
   else
      zsm_sset_range_noaccess (a, szB);
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
}

Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
   tl_assert(thr);
   return thr->hgthread;
}

void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
   tl_assert(thr);
   thr->hgthread = hgthread;
}

void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
{
   zsm_scopy_range(src, dst, len);
   Filter__clear_range( thr->filter, dst, len );
}
libhb_maybe_GC(void)7384 void libhb_maybe_GC ( void )
7385 {
7386    /* GC the unreferenced (zero rc) RCECs when
7387          (1) reaching a significant nr of RCECs (to avoid scanning a contextTab
7388              with mostly NULL ptr)
7389      and (2) approaching the max nr of RCEC (as we have in any case
7390              at least that amount of RCEC in the pool allocator)
7391              Note: the margin allows to avoid a small but constant increase
7392              of the max nr of RCEC due to the fact that libhb_maybe_GC is
7393              not called when the current nr of RCEC exactly reaches the max.
7394      and (3) the nr of referenced RCECs is less than 75% than total nr RCECs.
7395      Avoid growing too much the nr of RCEC keeps the memory use low,
7396      and avoids to have too many elements in the (fixed) contextTab hashtable.
7397    */
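   /* Hedged illustration with made-up numbers: suppose N_RCEC_TAB were
      100000 (value for illustration only).  With stats__ctxt_tab_curr ==
      60000, stats__ctxt_tab_max == 60800 and RCEC_referenced == 40000,
      all three conditions hold (60000 > 50000; 60000 + 1000 >= 60800;
      45000 > 40000), so do_RCEC_GC() would run. */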
   if (UNLIKELY(stats__ctxt_tab_curr > N_RCEC_TAB/2
                && stats__ctxt_tab_curr + 1000 >= stats__ctxt_tab_max
                && (stats__ctxt_tab_curr * 3)/4 > RCEC_referenced))
      do_RCEC_GC();

   /* If there are still no free entries in the vts_tab (i.e. its
      freelist is empty) and we have hit the threshold point, then do a
      GC. */
   Bool vts_tab_GC = vts_tab_freelist == VtsID_INVALID
      && VG_(sizeXA)( vts_tab ) >= vts_next_GC_at;
   if (UNLIKELY (vts_tab_GC))
      vts_tab__do_GC( False/*don't show stats*/ );

   /* Scan-GC the SecMaps when
          (1) there is no SecMap in the freelist
      and (2) the current number of live SecMaps exceeds the threshold. */
   if (UNLIKELY(SecMap_freelist == NULL
                && stats__secmaps_in_map_shmem >= next_SecMap_GC_at)) {
      // If we did a vts tab GC, then no need to flush the cache again.
      if (!vts_tab_GC)
         zsm_flush_cache();
      shmem__SecMap_do_GC(True);
   }

   /* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts();
}


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END main library                                    //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/*--------------------------------------------------------------------*/
/*--- end                                             libhb_main.c ---*/
/*--------------------------------------------------------------------*/