/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "../Languages/OpenCL/IBiF_SPIRV_Utils.cl"

extern __constant int __OptDisable;

// EmitVISAPass support for __builtin_IB_memfence/__builtin_IB_typedmemfence requires some arguments to
// be constants, as those are used to prepare a message descriptor and therefore must be known at compile time.
// To ensure that all the arguments are constants on the O0 path, there is a special function
// marked with __attribute__((optnone)) which implements a separate call instruction for each
// argument configuration.

// MEMFENCE IMPLEMENTATION

void __attribute__((optnone)) __intel_memfence_optnone(bool flushRW, bool isGlobal, bool invalidateL1)
{
#define MEMFENCE_IF(V1, V5, V6)                                    \
if (flushRW == V1 && isGlobal == V5 && invalidateL1 == V6)         \
{                                                                  \
    __builtin_IB_memfence(true, V1, false, false, false, V5, V6);  \
}

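    // For illustration, MEMFENCE_IF(false, true, true) expands to:
    //   if (flushRW == false && isGlobal == true && invalidateL1 == true)
    //   {
    //       __builtin_IB_memfence(true, false, false, false, false, true, true);
    //   }
    // so each branch below passes only compile-time-constant arguments to the builtin.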
    MEMFENCE_IF(false, false, false)
    else MEMFENCE_IF(false, false, true)
    else MEMFENCE_IF(false, true, false)
    else MEMFENCE_IF(false, true, true)
    else MEMFENCE_IF(true, false, false)
    else MEMFENCE_IF(true, false, true)
    else MEMFENCE_IF(true, true, false)
    else MEMFENCE_IF(true, true, true)

#undef MEMFENCE_IF
}
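// Optimized path: with optimizations enabled, the arguments are expected to be
// constant-folded before EmitVISAPass, so no per-configuration branching is needed.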
void __intel_memfence(bool flushRW, bool isGlobal, bool invalidateL1)
{
    __builtin_IB_memfence(true, flushRW, false, false, false, isGlobal, invalidateL1);
}

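// Common entry point: dispatches to the optnone variant on the O0 path (when
// optimizations are disabled) and to the regular variant otherwise.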
void __intel_memfence_handler(bool flushRW, bool isGlobal, bool invalidateL1)
{
    if (__OptDisable)
        __intel_memfence_optnone(flushRW, isGlobal, invalidateL1);
    else
        __intel_memfence(flushRW, isGlobal, invalidateL1);
}

// TYPEDMEMFENCE IMPLEMENTATION

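// Same pattern as the memfence implementation above: the optnone variant keeps the
// __builtin_IB_typedmemfence argument a compile-time constant on the O0 path.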
void __attribute__((optnone)) __intel_typedmemfence_optnone(bool invalidateL1)
{
    if (invalidateL1)
        __builtin_IB_typedmemfence(true);
    else
        __builtin_IB_typedmemfence(false);
}

void __intel_typedmemfence(bool invalidateL1)
{
    __builtin_IB_typedmemfence(invalidateL1);
}

void __intel_typedmemfence_handler(bool invalidateL1)
{
    if (__OptDisable)
        __intel_typedmemfence_optnone(invalidateL1);
    else
        __intel_typedmemfence(invalidateL1);
}

// Barrier Instructions

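// Translates SPIR-V memory semantics and scope into the fence configuration expected by
// the handlers above: acquire-style semantics additionally invalidate L1, image memory is
// always fenced, and the L3 flush is requested only for device (or wider) scope.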
static void __intel_atomic_work_item_fence( Scope_t Memory, uint Semantics )
{
    bool fence = Semantics & ( Acquire | Release | AcquireRelease | SequentiallyConsistent );

    bool invalidateL1 = Semantics & ( Acquire | AcquireRelease | SequentiallyConsistent );

    // We always need to 'fence' image memory (i.e., flush caches, drain pipelines)
    fence |= ( Semantics & ImageMemory );

    if (fence)
    {
        if (Semantics & ImageMemory)
        {
            // An image fence requires a fence with R/W invalidate (L3 flush) plus a flush
            // of the sampler cache
            __intel_typedmemfence_handler(invalidateL1);
        }
        // A global/local memory fence requires a hardware fence in general, although on some
        // platforms it may be elided; platform-specific checks are performed in codegen.
        if (Semantics & WorkgroupMemory)
        {
            __intel_memfence_handler(false, false, false);
        }
        if (Semantics & CrossWorkgroupMemory)
        {
            bool flushL3 = Memory == Device || Memory == CrossDevice;
            __intel_memfence_handler(flushL3, true, invalidateL1);
        }
    }
}

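// OpControlBarrier: applies the requested memory fence first (not needed for sub-group
// execution scope), then emits a hardware thread-group barrier for workgroup or wider
// execution scope, or a sub-group barrier otherwise.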
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execution, int Memory, int Semantics)
{
    if (Execution != Subgroup)
    {
        // a sub-group barrier requires no fence
        __intel_atomic_work_item_fence( Memory, Semantics );
    }

    if( Execution <= Workgroup )
    {
        __builtin_IB_thread_group_barrier();
    }
    else if( Execution == Subgroup )
    {
        // nothing will be emitted, but we need to prevent optimizations from splitting the control flow
        __builtin_IB_sub_group_barrier();
    }
}

void SPIRV_OVERLOADABLE SPIRV_BUILTIN(MemoryBarrier, _i32_i32, )(int Memory, int Semantics)
{
    __intel_atomic_work_item_fence( Memory, Semantics );
}


// Named Barrier

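// The shared counter pointed to by 'id' tracks the next free slot in the named-barrier
// array; it is zeroed once here and then advanced by OpNamedBarrierInitialize below.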
void __intel_getInitializedNamedBarrierArray(local uint* id)
{
    *id = 0;
    SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )( Workgroup, 0, SequentiallyConsistent | WorkgroupMemory );
}

bool __intel_is_first_work_group_item( void );

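// Claims the next slot in the named-barrier array and initializes its counters. The
// surrounding work-group barriers ensure every work-item reads the same slot before
// the first work-item advances the shared index.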
local __namedBarrier* __builtin_spirv_OpNamedBarrierInitialize_i32_p3__namedBarrier_p3i32(int SubGroupCount, local __namedBarrier* nb_array, local uint* id)
{
    local __namedBarrier* NB = &nb_array[*id];
    NB->count = SubGroupCount;
    NB->orig_count = SubGroupCount;
    NB->inc = 0;
    SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )( Workgroup, 0, SequentiallyConsistent | WorkgroupMemory );
    if (__intel_is_first_work_group_item())
    {
        (*id)++;
    }
    SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )( Workgroup, 0, SequentiallyConsistent | WorkgroupMemory );
    return NB;
}


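// Thin wrappers over the SPIR-V atomic and broadcast built-ins, operating on the local
// uint counters of __namedBarrier (reinterpreted as int to match the built-in signatures).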
static INLINE OVERLOADABLE
uint AtomicCompareExchange(local uint *Pointer, uint Scope, uint Equal, uint Unequal, uint Value, uint Comparator)
{
    return SPIRV_BUILTIN(AtomicCompareExchange, _p3i32_i32_i32_i32_i32_i32, )((local int*)Pointer, Scope, Equal, Unequal, Value, Comparator);
}

static INLINE
uint SubgroupLocalId()
{
    return SPIRV_BUILTIN_NO_OP(BuiltInSubgroupLocalInvocationId, , )();
}

static INLINE OVERLOADABLE
uint AtomicLoad(local uint *Pointer, uint Scope, uint Semantics)
{
    return SPIRV_BUILTIN(AtomicLoad, _p3i32_i32_i32, )((local int*)Pointer, Scope, Semantics);
}

static INLINE OVERLOADABLE
void AtomicStore(local uint *Pointer, uint Scope, uint Semantics, uint Value)
{
    SPIRV_BUILTIN(AtomicStore, _p3i32_i32_i32_i32, )((local int*)Pointer, Scope, Semantics, Value);
}

static INLINE OVERLOADABLE
uint AtomicInc(local uint *Pointer, uint Scope, uint Semantics)
{
    return SPIRV_BUILTIN(AtomicIIncrement, _p3i32_i32_i32, )((local int*)Pointer, Scope, Semantics);
}

static INLINE
uint Broadcast(uint Execution, uint Value, uint3 LocalId)
{
    return SPIRV_BUILTIN(GroupBroadcast, _i32_i32_v3i32, )(Execution, as_int(Value), as_int3(LocalId));
}

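// Subgroup-uniform atomics: only lane 0 performs the atomic operation and the result is
// broadcast to the whole sub-group, so each arriving sub-group updates the counter once.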
static INLINE OVERLOADABLE
uint SubgroupAtomicCompareExchange(local uint *Pointer, uint Scope, uint Equal, uint Unequal, uint Value, uint Comparator)
{
    uint result = 0;
    if (SubgroupLocalId() == 0)
        result = AtomicCompareExchange((volatile local uint*)Pointer, Scope, Equal, Unequal, Value, Comparator);
    result = Broadcast(Subgroup, result, (uint3)0);
    return result;
}

static INLINE OVERLOADABLE
uint SubgroupAtomicInc(local uint *Pointer, uint Scope, uint Semantics)
{
    uint result = 0;
    if (SubgroupLocalId() == 0)
        result = AtomicInc((volatile local uint*)Pointer, Scope, Semantics);
    result = Broadcast(Subgroup, result, (uint3)0);
    return result;
}

static void MemoryBarrier(Scope_t Memory, uint Semantics)
{
    SPIRV_BUILTIN(MemoryBarrier, _i32_i32, )(Memory, Semantics);
}

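// OpMemoryNamedBarrier: each arriving sub-group decrements 'count' via compare-exchange,
// then spins until every expected sub-group has arrived (count reaches zero). After the
// memory fence, the last sub-group to pass (tracked by 'inc') resets the barrier so it
// can be reused.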
void __builtin_spirv_OpMemoryNamedBarrier_p3__namedBarrier_i32_i32(local __namedBarrier* NB, Scope_t Memory, uint Semantics)
{
    const uint AtomSema = SequentiallyConsistent | WorkgroupMemory;
    while (1)
    {
        const uint cnt = AtomicLoad(&NB->count, Workgroup, AtomSema);
        if (cnt > 0)
        {
            uint before = SubgroupAtomicCompareExchange(&NB->count, Workgroup, AtomSema, AtomSema, cnt - 1, cnt);
            if (before == cnt)
            {
                break;
            }
        }
    }

    while (AtomicLoad(&NB->count, Workgroup, AtomSema) > 0);
    MemoryBarrier(Memory, Semantics);
    uint inc = SubgroupAtomicInc(&NB->inc, Workgroup, AtomSema);
    if (inc == ((NB->orig_count) - 1))
    {
        AtomicStore(&NB->inc, Workgroup, AtomSema, 0);
        AtomicStore(&NB->count, Workgroup, AtomSema, NB->orig_count);
    }
}
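// OpenCL wrappers: translate cl_mem_fence_flags (and, in the second overload, memory_scope)
// into SPIR-V memory semantics and scope for the named-barrier wait above.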
void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32(local __namedBarrier* barrier, cl_mem_fence_flags flags)
{
    __builtin_spirv_OpMemoryNamedBarrier_p3__namedBarrier_i32_i32(barrier, Workgroup, AcquireRelease | get_spirv_mem_fence(flags));
}

void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(local __namedBarrier* barrier, cl_mem_fence_flags flags, memory_scope scope)
{
    __builtin_spirv_OpMemoryNamedBarrier_p3__namedBarrier_i32_i32(barrier, get_spirv_mem_scope(scope), AcquireRelease | get_spirv_mem_fence(flags));
}