1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 * SPDX-License-Identifier: MIT
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #ifndef __NV_SEMAPHORE_H__
25 #define __NV_SEMAPHORE_H__
26
27 #include "nvtypes.h"
28 #include "nvCpuIntrinsics.h"
29
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33
/*
 * Semaphore layout used with pre-Volta GPUs, which can only read/write a
 * 32-bit semaphore payload.  The 64-bit 'timer' field is repurposed by the
 * CPU to stash the most recent full 64-bit value written or submitted (see
 * the Fermi helpers below).
 */
typedef volatile struct {
    NvU32 payload;      // GPU-accessible 32-bit semaphore value
    NvU32 reportValue;  // NOTE(review): written by GPU report methods, presumably — confirm
    NvU64 timer;        // Repurposed: last full 64-bit value stashed by the CPU
} NvReportSemaphore32;

/*
 * Semaphore layout for Volta and later GPUs, which support native 64-bit
 * semaphore values.
 */
typedef volatile struct {
    NvU64 reportValue;  // Full 64-bit semaphore payload
    NvU64 timer;        // assumes GPU-written timestamp — TODO confirm
} NvReportSemaphore64;

/* Either view over the same 16 bytes of GPU-accessible memory. */
typedef volatile union {
    NvReportSemaphore32 sema32;
    NvReportSemaphore64 sema64;
} NvReportSemaphore;

/*
 * These structures can't change size.  They map to the GPU and other driver
 * components expect the same size.
 */
ct_assert(sizeof(NvReportSemaphore32) == 16);
ct_assert(sizeof(NvReportSemaphore64) == 16);
ct_assert(sizeof(NvReportSemaphore) == 16);
57
58 /*
59 * Pre-Volta GPUs can only read/write a 32-bit semaphore. Rather than try to
60 * use multiple semaphore writes to emulate a full 64-bit write, which is prone
61 * to race conditions when the value wraps, derive the full 64-bit value by
 * comparing the current GPU-accessible value with the last value written by
63 * the CPU or submitted to be written by the GPU, which is stashed in the
64 * timestamp field of the semaphore by the CPU in both these cases.
65 */
NvTimeSemFermiSetMaxSubmittedVal(volatile NvU64 * maxSubmittedPtr,const NvU64 value)66 static inline void NvTimeSemFermiSetMaxSubmittedVal(
67 volatile NvU64 *maxSubmittedPtr,
68 const NvU64 value)
69 {
70 NvU64 oldValue =
71 (NvU64)__NVatomicCompareExchange64((volatile NvS64 *)maxSubmittedPtr,
72 0, 0);
73
74 // Atomically set report->timer to max(value, report->time).
75 while (oldValue < value) {
76 const NvU64 prevValue =
77 (NvU64)__NVatomicCompareExchange64((volatile NvS64 *)maxSubmittedPtr,
78 (NvS64)value,
79 (NvS64)oldValue);
80 if (prevValue == oldValue) {
81 // The specified value was set. Done.
82 nvAssert(*maxSubmittedPtr >= value);
83 break;
84 }
85
86 oldValue = prevValue;
87 }
88 }
89
/*
 * Convenience wrapper: record 'value' as the max submitted value in the
 * repurposed 'timer' field of a 32-bit report semaphore.
 */
static inline void NvTimeSemFermiSetMaxSubmitted(
    NvReportSemaphore32 *report,
    const NvU64 value)
{
    NvTimeSemFermiSetMaxSubmittedVal(&report->timer, value);
}
96
NvTimeSemFermiGetPayloadVal(volatile void * payloadPtr,volatile void * maxSubmittedPtr)97 static inline NvU64 NvTimeSemFermiGetPayloadVal(
98 volatile void *payloadPtr,
99 volatile void *maxSubmittedPtr)
100 {
101 // The ordering of the two operations below is critical. Other threads
102 // may be submitting GPU work that modifies the semaphore value, or
103 // modifying it from the CPU themselves. Both of those operations first
104 // set the 64-bit max submitted/timer value, then modify or submit work
105 // to modify the 32-bit payload value. Consider this hypothetical timeline
106 // if the order of operations below is reversed:
107 //
108 // thread1:
109 // -SetMaxSubmitted(0x1);
110 // -report->payload = 0x1;
111 //
112 // thread2:
113 // -Reads 0x1 from report->timer
114 //
115 // thread1:
116 // -SetMaxSubmitted(0x7fffffff);
117 // -report->payload = 0x7fffffff;
118 // -SetMaxSubmitted(0x100000000);
119 // -report->payload = 0x00000000;
120 //
121 // thread2:
122 // -Reads 0x0 from report->payload
123 //
124 // The logic below would see 0 (payload) is less than 1 (max submitted) and
125 // determine a wrap is outstanding, subtract one from the high 32-bits of
126 // the max submitted value (0x00000000 - 0x1), overflow, and return the
127 // current 64-bit value as 0xffffffff00000000 when the correct value is
128 // 0x100000000. To avoid this, we must read the payload prior to reading
129 // the max submitted value from the timer field. The logic can correctly
130 // adjust the max submitted value back down if a wrap occurs between these
131 // two operations, but has no way to bump the max submitted value up if a
132 // wrap occurs with the opposite ordering.
133 NvU64 current = *(volatile NvU32*)payloadPtr;
134 // Use an atomic exchange to ensure the 64-bit read is atomic even on 32-bit
135 // CPUs.
136 NvU64 submitted = (NvU64)
137 __NVatomicCompareExchange64((volatile NvS64 *)maxSubmittedPtr, 0ll, 0ll);
138
139 nvAssert(!(current & 0xFFFFFFFF00000000ull));
140
141 // The value is monotonically increasing, and differ by no more than
142 // 2^31 - 1. Hence, if the low word of the submitted value is less
143 // than the low word of the current value, exactly one 32-bit wrap
144 // occurred between the current value and the most recently
145 // submitted value. Walk back the high word to match the value
146 // associated with the current GPU-visible value.
147 if ((submitted & 0xFFFFFFFFull) < current) {
148 submitted -= 0x100000000ull;
149 }
150
151 current |= (submitted & 0xFFFFFFFF00000000ull);
152
153 return current;
154 }
155
NvTimeSemFermiGetPayload(NvReportSemaphore32 * report)156 static inline NvU64 NvTimeSemFermiGetPayload(
157 NvReportSemaphore32 *report)
158 {
159 return NvTimeSemFermiGetPayloadVal(&report->payload, &report->timer);
160 }
161
/*
 * CPU write of a 64-bit payload to a 32-bit report semaphore.
 *
 * The full value must be stashed in the timer field BEFORE the low 32 bits
 * become visible in the GPU-accessible payload; readers rely on this
 * ordering (see NvTimeSemFermiGetPayloadVal).
 */
static inline void NvTimeSemFermiSetPayload(
    NvReportSemaphore32 *report,
    const NvU64 payload)
{
    // First save the actual value to the reserved/timer bits.
    NvTimeSemFermiSetMaxSubmittedVal(&report->timer, payload);

    // Then write the low bits to the GPU-accessible semaphore value.
    report->payload = (NvU32)(payload & 0xFFFFFFFFULL);
}
172
173 /*
174 * Volta and up.
175 */
176
NvTimeSemVoltaGetPayloadVal(volatile void * payloadPtr)177 static inline NvU64 NvTimeSemVoltaGetPayloadVal(
178 volatile void *payloadPtr)
179 {
180 nvAssert(payloadPtr);
181 return (NvU64)
182 __NVatomicCompareExchange64((volatile NvS64 *)payloadPtr,
183 0, 0);
184 }
185
NvTimeSemVoltaGetPayload(NvReportSemaphore64 * report)186 static inline NvU64 NvTimeSemVoltaGetPayload(
187 NvReportSemaphore64 *report)
188 {
189 return NvTimeSemVoltaGetPayloadVal(&report->reportValue);
190 }
191
/*
 * CPU write of a 64-bit payload to a Volta-style report semaphore.
 *
 * A CAS loop is used (rather than a plain store) so the 64-bit write is
 * atomic even on 32-bit CPUs.  The loop starts by guessing the current
 * value is 0 and retries with the observed value until the exchange lands.
 * The assert enforces that payloads only ever move forward: any value
 * observed in the semaphore must be older (smaller) than the one being set.
 */
static inline void NvTimeSemVoltaSetPayload(
    NvReportSemaphore64 *report,
    const NvU64 payload)
{
    NvU64 oldPayload = 0;

    while (NV_TRUE) {
        NvU64 prevPayload = (NvU64)
            __NVatomicCompareExchange64((volatile NvS64 *)&report->reportValue,
                                        (NvS64)payload, (NvS64)oldPayload);

        if (prevPayload == oldPayload) {
            // Exchange succeeded; the new payload is in place.
            break;
        }

        // Monotonicity check: the value we raced with must predate ours.
        nvAssert(prevPayload < payload);

        oldPayload = prevPayload;
    }
}
212
213 #ifdef __cplusplus
214 };
215 #endif
216
217 #endif /* __NV_SEMAPHORE_H__ */
218