1 //===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "Cuda.h"
10 #include "CommonArgs.h"
11 #include "clang/Basic/Cuda.h"
12 #include "clang/Config/config.h"
13 #include "clang/Driver/Compilation.h"
14 #include "clang/Driver/Distro.h"
15 #include "clang/Driver/Driver.h"
16 #include "clang/Driver/DriverDiagnostic.h"
17 #include "clang/Driver/InputInfo.h"
18 #include "clang/Driver/Options.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/Option/ArgList.h"
21 #include "llvm/Support/FileSystem.h"
22 #include "llvm/Support/Host.h"
23 #include "llvm/Support/Path.h"
24 #include "llvm/Support/Process.h"
25 #include "llvm/Support/Program.h"
26 #include "llvm/Support/TargetParser.h"
27 #include "llvm/Support/VirtualFileSystem.h"
28 #include <system_error>
29
30 using namespace clang::driver;
31 using namespace clang::driver::toolchains;
32 using namespace clang::driver::tools;
33 using namespace clang;
34 using namespace llvm::opt;
35
36 namespace {
// Result of CUDA version detection: the version enumerator plus a
// human-readable description of where the version came from (used when
// diagnosing unknown/unsupported versions).
struct CudaVersionInfo {
  std::string DetectedVersion;
  CudaVersion Version;
};
41 // Parses the contents of version.txt in an CUDA installation. It should
42 // contain one line of the from e.g. "CUDA Version 7.5.2".
parseCudaVersionFile(llvm::StringRef V)43 CudaVersionInfo parseCudaVersionFile(llvm::StringRef V) {
44 V = V.trim();
45 if (!V.startswith("CUDA Version "))
46 return {V.str(), CudaVersion::UNKNOWN};
47 V = V.substr(strlen("CUDA Version "));
48 SmallVector<StringRef,4> VersionParts;
49 V.split(VersionParts, '.');
50 return {"version.txt: " + V.str() + ".",
51 VersionParts.size() < 2
52 ? CudaVersion::UNKNOWN
53 : CudaStringToVersion(
54 join_items(".", VersionParts[0], VersionParts[1]))};
55 }
56
getCudaVersion(uint32_t raw_version)57 CudaVersion getCudaVersion(uint32_t raw_version) {
58 if (raw_version < 7050)
59 return CudaVersion::CUDA_70;
60 if (raw_version < 8000)
61 return CudaVersion::CUDA_75;
62 if (raw_version < 9000)
63 return CudaVersion::CUDA_80;
64 if (raw_version < 9010)
65 return CudaVersion::CUDA_90;
66 if (raw_version < 9020)
67 return CudaVersion::CUDA_91;
68 if (raw_version < 10000)
69 return CudaVersion::CUDA_92;
70 if (raw_version < 10010)
71 return CudaVersion::CUDA_100;
72 if (raw_version < 10020)
73 return CudaVersion::CUDA_101;
74 if (raw_version < 11000)
75 return CudaVersion::CUDA_102;
76 if (raw_version < 11010)
77 return CudaVersion::CUDA_110;
78 if (raw_version < 11020)
79 return CudaVersion::CUDA_111;
80 return CudaVersion::LATEST;
81 }
82
parseCudaHFile(llvm::StringRef Input)83 CudaVersionInfo parseCudaHFile(llvm::StringRef Input) {
84 // Helper lambda which skips the words if the line starts with them or returns
85 // None otherwise.
86 auto StartsWithWords =
87 [](llvm::StringRef Line,
88 const SmallVector<StringRef, 3> words) -> llvm::Optional<StringRef> {
89 for (StringRef word : words) {
90 if (!Line.consume_front(word))
91 return {};
92 Line = Line.ltrim();
93 }
94 return Line;
95 };
96
97 Input = Input.ltrim();
98 while (!Input.empty()) {
99 if (auto Line =
100 StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) {
101 uint32_t RawVersion;
102 Line->consumeInteger(10, RawVersion);
103 return {"cuda.h: CUDA_VERSION=" + Twine(RawVersion).str() + ".",
104 getCudaVersion(RawVersion)};
105 }
106 // Find next non-empty line.
107 Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim();
108 }
109 return {"cuda.h: CUDA_VERSION not found.", CudaVersion::UNKNOWN};
110 }
111 } // namespace
112
// Emits a driver warning when the detected CUDA version is newer than the
// latest version clang officially supports. DetectedVersionIsNotSupported
// and DetectedVersion are filled in by the constructor during detection.
void CudaInstallationDetector::WarnIfUnsupportedVersion() {
  if (DetectedVersionIsNotSupported)
    D.Diag(diag::warn_drv_unknown_cuda_version)
        << DetectedVersion
        << CudaVersionToString(CudaVersion::LATEST_SUPPORTED);
}
119
CudaInstallationDetector(const Driver & D,const llvm::Triple & HostTriple,const llvm::opt::ArgList & Args)120 CudaInstallationDetector::CudaInstallationDetector(
121 const Driver &D, const llvm::Triple &HostTriple,
122 const llvm::opt::ArgList &Args)
123 : D(D) {
124 struct Candidate {
125 std::string Path;
126 bool StrictChecking;
127
128 Candidate(std::string Path, bool StrictChecking = false)
129 : Path(Path), StrictChecking(StrictChecking) {}
130 };
131 SmallVector<Candidate, 4> Candidates;
132
133 // In decreasing order so we prefer newer versions to older versions.
134 std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
135 auto &FS = D.getVFS();
136
137 if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
138 Candidates.emplace_back(
139 Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
140 } else if (HostTriple.isOSWindows()) {
141 for (const char *Ver : Versions)
142 Candidates.emplace_back(
143 D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
144 Ver);
145 } else {
146 if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
147 // Try to find ptxas binary. If the executable is located in a directory
148 // called 'bin/', its parent directory might be a good guess for a valid
149 // CUDA installation.
150 // However, some distributions might installs 'ptxas' to /usr/bin. In that
151 // case the candidate would be '/usr' which passes the following checks
152 // because '/usr/include' exists as well. To avoid this case, we always
153 // check for the directory potentially containing files for libdevice,
154 // even if the user passes -nocudalib.
155 if (llvm::ErrorOr<std::string> ptxas =
156 llvm::sys::findProgramByName("ptxas")) {
157 SmallString<256> ptxasAbsolutePath;
158 llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath);
159
160 StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath);
161 if (llvm::sys::path::filename(ptxasDir) == "bin")
162 Candidates.emplace_back(
163 std::string(llvm::sys::path::parent_path(ptxasDir)),
164 /*StrictChecking=*/true);
165 }
166 }
167
168 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda");
169 for (const char *Ver : Versions)
170 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver);
171
172 Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple()));
173 if (Dist.IsDebian() || Dist.IsUbuntu())
174 // Special case for Debian to have nvidia-cuda-toolkit work
175 // out of the box. More info on http://bugs.debian.org/882505
176 Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda");
177 }
178
179 bool NoCudaLib = Args.hasArg(options::OPT_nogpulib);
180
181 for (const auto &Candidate : Candidates) {
182 InstallPath = Candidate.Path;
183 if (InstallPath.empty() || !FS.exists(InstallPath))
184 continue;
185
186 BinPath = InstallPath + "/bin";
187 IncludePath = InstallPath + "/include";
188 LibDevicePath = InstallPath + "/nvvm/libdevice";
189
190 if (!(FS.exists(IncludePath) && FS.exists(BinPath)))
191 continue;
192 bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking);
193 if (CheckLibDevice && !FS.exists(LibDevicePath))
194 continue;
195
196 // On Linux, we have both lib and lib64 directories, and we need to choose
197 // based on our triple. On MacOS, we have only a lib directory.
198 //
199 // It's sufficient for our purposes to be flexible: If both lib and lib64
200 // exist, we choose whichever one matches our triple. Otherwise, if only
201 // lib exists, we use it.
202 if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
203 LibPath = InstallPath + "/lib64";
204 else if (FS.exists(InstallPath + "/lib"))
205 LibPath = InstallPath + "/lib";
206 else
207 continue;
208
209 CudaVersionInfo VersionInfo = {"", CudaVersion::UNKNOWN};
210 if (auto VersionFile = FS.getBufferForFile(InstallPath + "/version.txt"))
211 VersionInfo = parseCudaVersionFile((*VersionFile)->getBuffer());
212 // If version file didn't give us the version, try to find it in cuda.h
213 if (VersionInfo.Version == CudaVersion::UNKNOWN)
214 if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h"))
215 VersionInfo = parseCudaHFile((*CudaHFile)->getBuffer());
216 // As the last resort, make an educated guess between CUDA-7.0, (which had
217 // no version.txt file and had old-style libdevice bitcode ) and an unknown
218 // recent CUDA version (no version.txt, new style bitcode).
219 if (VersionInfo.Version == CudaVersion::UNKNOWN) {
220 VersionInfo.Version = (FS.exists(LibDevicePath + "/libdevice.10.bc"))
221 ? Version = CudaVersion::LATEST
222 : Version = CudaVersion::CUDA_70;
223 VersionInfo.DetectedVersion =
224 "No version found in version.txt or cuda.h.";
225 }
226
227 Version = VersionInfo.Version;
228 DetectedVersion = VersionInfo.DetectedVersion;
229
230 // TODO(tra): remove the warning once we have all features of 10.2
231 // and 11.0 implemented.
232 DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED;
233
234 if (Version >= CudaVersion::CUDA_90) {
235 // CUDA-9+ uses single libdevice file for all GPU variants.
236 std::string FilePath = LibDevicePath + "/libdevice.10.bc";
237 if (FS.exists(FilePath)) {
238 for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E;
239 ++Arch) {
240 CudaArch GpuArch = static_cast<CudaArch>(Arch);
241 if (!IsNVIDIAGpuArch(GpuArch))
242 continue;
243 std::string GpuArchName(CudaArchToString(GpuArch));
244 LibDeviceMap[GpuArchName] = FilePath;
245 }
246 }
247 } else {
248 std::error_code EC;
249 for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC),
250 LE;
251 !EC && LI != LE; LI = LI.increment(EC)) {
252 StringRef FilePath = LI->path();
253 StringRef FileName = llvm::sys::path::filename(FilePath);
254 // Process all bitcode filenames that look like
255 // libdevice.compute_XX.YY.bc
256 const StringRef LibDeviceName = "libdevice.";
257 if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
258 continue;
259 StringRef GpuArch = FileName.slice(
260 LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
261 LibDeviceMap[GpuArch] = FilePath.str();
262 // Insert map entries for specific devices with this compute
263 // capability. NVCC's choice of the libdevice library version is
264 // rather peculiar and depends on the CUDA version.
265 if (GpuArch == "compute_20") {
266 LibDeviceMap["sm_20"] = std::string(FilePath);
267 LibDeviceMap["sm_21"] = std::string(FilePath);
268 LibDeviceMap["sm_32"] = std::string(FilePath);
269 } else if (GpuArch == "compute_30") {
270 LibDeviceMap["sm_30"] = std::string(FilePath);
271 if (Version < CudaVersion::CUDA_80) {
272 LibDeviceMap["sm_50"] = std::string(FilePath);
273 LibDeviceMap["sm_52"] = std::string(FilePath);
274 LibDeviceMap["sm_53"] = std::string(FilePath);
275 }
276 LibDeviceMap["sm_60"] = std::string(FilePath);
277 LibDeviceMap["sm_61"] = std::string(FilePath);
278 LibDeviceMap["sm_62"] = std::string(FilePath);
279 } else if (GpuArch == "compute_35") {
280 LibDeviceMap["sm_35"] = std::string(FilePath);
281 LibDeviceMap["sm_37"] = std::string(FilePath);
282 } else if (GpuArch == "compute_50") {
283 if (Version >= CudaVersion::CUDA_80) {
284 LibDeviceMap["sm_50"] = std::string(FilePath);
285 LibDeviceMap["sm_52"] = std::string(FilePath);
286 LibDeviceMap["sm_53"] = std::string(FilePath);
287 }
288 }
289 }
290 }
291
292 // Check that we have found at least one libdevice that we can link in if
293 // -nocudalib hasn't been specified.
294 if (LibDeviceMap.empty() && !NoCudaLib)
295 continue;
296
297 IsValid = true;
298 break;
299 }
300 }
301
AddCudaIncludeArgs(const ArgList & DriverArgs,ArgStringList & CC1Args) const302 void CudaInstallationDetector::AddCudaIncludeArgs(
303 const ArgList &DriverArgs, ArgStringList &CC1Args) const {
304 if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
305 // Add cuda_wrappers/* to our system include path. This lets us wrap
306 // standard library headers.
307 SmallString<128> P(D.ResourceDir);
308 llvm::sys::path::append(P, "include");
309 llvm::sys::path::append(P, "cuda_wrappers");
310 CC1Args.push_back("-internal-isystem");
311 CC1Args.push_back(DriverArgs.MakeArgString(P));
312 }
313
314 if (DriverArgs.hasArg(options::OPT_nogpuinc))
315 return;
316
317 if (!isValid()) {
318 D.Diag(diag::err_drv_no_cuda_installation);
319 return;
320 }
321
322 CC1Args.push_back("-internal-isystem");
323 CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
324 CC1Args.push_back("-include");
325 CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
326 }
327
CheckCudaVersionSupportsArch(CudaArch Arch) const328 void CudaInstallationDetector::CheckCudaVersionSupportsArch(
329 CudaArch Arch) const {
330 if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
331 ArchsWithBadVersion[(int)Arch])
332 return;
333
334 auto MinVersion = MinVersionForCudaArch(Arch);
335 auto MaxVersion = MaxVersionForCudaArch(Arch);
336 if (Version < MinVersion || Version > MaxVersion) {
337 ArchsWithBadVersion[(int)Arch] = true;
338 D.Diag(diag::err_drv_cuda_version_unsupported)
339 << CudaArchToString(Arch) << CudaVersionToString(MinVersion)
340 << CudaVersionToString(MaxVersion) << InstallPath
341 << CudaVersionToString(Version);
342 }
343 }
344
print(raw_ostream & OS) const345 void CudaInstallationDetector::print(raw_ostream &OS) const {
346 if (isValid())
347 OS << "Found CUDA installation: " << InstallPath << ", version "
348 << CudaVersionToString(Version) << "\n";
349 }
350
namespace {
/// Debug info level for the NVPTX devices. We may need to emit a different
/// debug info level for the host and for the device itself. This type controls
/// emission of the debug info for the devices. It either disables debug info
/// emission completely, or emits debug directives only, or emits the same
/// debug info as for the host.
enum DeviceDebugInfoLevel {
  DisableDebugInfo,        /// Do not emit debug info for the devices.
  DebugDirectivesOnly,     /// Emit only debug directives.
  EmitSameDebugInfoAsHost, /// Use the same debug info level just like for the
                           /// host.
};
} // anonymous namespace
364
/// Define debug info level for the NVPTX devices. If the debug info for both
/// the host and device are disabled (-g0/-ggdb0 or no debug options at all). If
/// only debug directives are requested for the both host and device
/// (-gline-directives-only), or the debug info only for the device is disabled
/// (optimization is on and --cuda-noopt-device-debug was not specified), the
/// debug directives only must be emitted for the device. Otherwise, use the
/// same debug info level just like for the host (with the limitations of only
/// supported DWARF2 standard).
static DeviceDebugInfoLevel mustEmitDebugInfo(const ArgList &Args) {
  const Arg *A = Args.getLastArg(options::OPT_O_Group);
  // Full device debug info is allowed only when not optimizing, or when the
  // user explicitly asked for it via --cuda-noopt-device-debug.
  bool IsDebugEnabled = !A || A->getOption().matches(options::OPT_O0) ||
                        Args.hasFlag(options::OPT_cuda_noopt_device_debug,
                                     options::OPT_no_cuda_noopt_device_debug,
                                     /*Default=*/false);
  if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) {
    const Option &Opt = A->getOption();
    if (Opt.matches(options::OPT_gN_Group)) {
      if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0))
        return DisableDebugInfo;
      if (Opt.matches(options::OPT_gline_directives_only))
        return DebugDirectivesOnly;
    }
    // Some -g form was given: emit full info if allowed, else directives only.
    return IsDebugEnabled ? EmitSameDebugInfoAsHost : DebugDirectivesOnly;
  }
  // No -g at all: emit directives only when remarks need line information.
  return willEmitRemarks(Args) ? DebugDirectivesOnly : DisableDebugInfo;
}
391
// Builds the ptxas command line that assembles one PTX input into a cubin
// for the GPU architecture attached to this job action. Handles debug-info
// selection, -O mapping, -Xcuda-ptxas passthrough and relocatable output.
void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfo &Output,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  StringRef GPUArchName;
  // If this is an OpenMP action we need to extract the device architecture
  // from the -march=arch option. This option may come from -Xopenmp-target
  // flag or the default value.
  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
    GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
    assert(!GPUArchName.empty() && "Must have an architecture passed in.");
  } else
    GPUArchName = JA.getOffloadingArch();

  // Obtain architecture from the action.
  CudaArch gpu_arch = StringToCudaArch(GPUArchName);
  assert(gpu_arch != CudaArch::UNKNOWN &&
         "Device action expected to have an architecture.");

  // Check that our installation's ptxas supports gpu_arch.
  if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
    TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);
  }

  ArgStringList CmdArgs;
  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
  DeviceDebugInfoLevel DIKind = mustEmitDebugInfo(Args);
  if (DIKind == EmitSameDebugInfoAsHost) {
    // ptxas does not accept -g option if optimization is enabled, so
    // we ignore the compiler's -O* options if we want debug info.
    CmdArgs.push_back("-g");
    CmdArgs.push_back("--dont-merge-basicblocks");
    CmdArgs.push_back("--return-at-end");
  } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
    // Map the -O we received to -O{0,1,2,3}.
    //
    // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
    // default, so it may correspond more closely to the spirit of clang -O2.

    // -O3 seems like the least-bad option when -Osomething is specified to
    // clang but it isn't handled below.
    StringRef OOpt = "3";
    if (A->getOption().matches(options::OPT_O4) ||
        A->getOption().matches(options::OPT_Ofast))
      OOpt = "3";
    else if (A->getOption().matches(options::OPT_O0))
      OOpt = "0";
    else if (A->getOption().matches(options::OPT_O)) {
      // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.
      OOpt = llvm::StringSwitch<const char *>(A->getValue())
                 .Case("1", "1")
                 .Case("2", "2")
                 .Case("3", "3")
                 .Case("s", "2")
                 .Case("z", "2")
                 .Default("2");
    }
    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
  } else {
    // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
    // to no optimizations, but ptxas's default is -O3.
    CmdArgs.push_back("-O0");
  }
  if (DIKind == DebugDirectivesOnly)
    CmdArgs.push_back("-lineinfo");

  // Pass -v to ptxas if it was passed to the driver.
  if (Args.hasArg(options::OPT_v))
    CmdArgs.push_back("-v");

  CmdArgs.push_back("--gpu-name");
  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
  CmdArgs.push_back("--output-file");
  CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output)));
  for (const auto& II : Inputs)
    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));

  // Forward -Xcuda-ptxas arguments to ptxas verbatim.
  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
    CmdArgs.push_back(Args.MakeArgString(A));

  bool Relocatable = false;
  if (JA.isOffloading(Action::OFK_OpenMP))
    // In OpenMP we need to generate relocatable code.
    Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
                               options::OPT_fnoopenmp_relocatable_target,
                               /*Default=*/true);
  else if (JA.isOffloading(Action::OFK_Cuda))
    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
                               options::OPT_fno_gpu_rdc, /*Default=*/false);

  if (Relocatable)
    CmdArgs.push_back("-c");

  // -ptxas-path= overrides the toolchain's program-path search for ptxas.
  const char *Exec;
  if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ))
    Exec = A->getValue();
  else
    Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}
501
shouldIncludePTX(const ArgList & Args,const char * gpu_arch)502 static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
503 bool includePTX = true;
504 for (Arg *A : Args) {
505 if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
506 A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
507 continue;
508 A->claim();
509 const StringRef ArchStr = A->getValue();
510 if (ArchStr == "all" || ArchStr == gpu_arch) {
511 includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
512 continue;
513 }
514 }
515 return includePTX;
516 }
517
// All inputs to this linker must be from CudaDeviceActions, as we need to look
// at the Inputs' Actions in order to figure out which GPU architecture they
// correspond to.
void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                 const InputInfo &Output,
                                 const InputInfoList &Inputs,
                                 const ArgList &Args,
                                 const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  ArgStringList CmdArgs;
  // Pass --cuda for installations up to CUDA 10.0 (newer fatbinary versions
  // are invoked without it).
  if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100)
    CmdArgs.push_back("--cuda");
  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
  CmdArgs.push_back(Args.MakeArgString("--create"));
  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
    CmdArgs.push_back("-g");

  for (const auto& II : Inputs) {
    auto *A = II.getAction();
    assert(A->getInputs().size() == 1 &&
           "Device offload action is expected to have a single input");
    const char *gpu_arch_str = A->getOffloadingArch();
    assert(gpu_arch_str &&
           "Device action expected to have associated a GPU architecture!");
    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);

    // Skip PTX inputs excluded via --no-cuda-include-ptx.
    if (II.getType() == types::TY_PP_Asm &&
        !shouldIncludePTX(Args, gpu_arch_str))
      continue;
    // We need to pass an Arch of the form "sm_XX" for cubin files and
    // "compute_XX" for ptx.
    const char *Arch = (II.getType() == types::TY_PP_Asm)
                           ? CudaArchToVirtualArchString(gpu_arch)
                           : gpu_arch_str;
    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                         Arch + ",file=" + II.getFilename()));
  }

  // Forward -Xcuda-fatbinary arguments to fatbinary verbatim.
  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
    CmdArgs.push_back(Args.MakeArgString(A));

  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}
570
// Builds the nvlink command that links cubin inputs into an OpenMP offload
// device image. LLVM IR/bitcode inputs are diagnosed and skipped since
// nvlink cannot consume them.
void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
                                       const InputInfo &Output,
                                       const InputInfoList &Inputs,
                                       const ArgList &Args,
                                       const char *LinkingOutput) const {
  const auto &TC =
      static_cast<const toolchains::CudaToolChain &>(getToolChain());
  assert(TC.getTriple().isNVPTX() && "Wrong platform");

  ArgStringList CmdArgs;

  // OpenMP uses nvlink to link cubin files. The result will be embedded in the
  // host binary by the host linker.
  assert(!JA.isHostOffloading(Action::OFK_OpenMP) &&
         "CUDA toolchain not expected for an OpenMP host device.");

  if (Output.isFilename()) {
    CmdArgs.push_back("-o");
    CmdArgs.push_back(Output.getFilename());
  } else
    assert(Output.isNothing() && "Invalid output.");
  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
    CmdArgs.push_back("-g");

  if (Args.hasArg(options::OPT_v))
    CmdArgs.push_back("-v");

  // The target GPU arch is taken from -march (possibly set via
  // -Xopenmp-target or the default).
  StringRef GPUArch =
      Args.getLastArgValue(options::OPT_march_EQ);
  assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas.");

  CmdArgs.push_back("-arch");
  CmdArgs.push_back(Args.MakeArgString(GPUArch));

  // Add paths specified in LIBRARY_PATH environment variable as -L options.
  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");

  // Add paths for the default clang library path.
  SmallString<256> DefaultLibPath =
      llvm::sys::path::parent_path(TC.getDriver().Dir);
  llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));

  for (const auto &II : Inputs) {
    // nvlink has no LLVM IR/bitcode support; diagnose and skip such inputs.
    if (II.getType() == types::TY_LLVM_IR ||
        II.getType() == types::TY_LTO_IR ||
        II.getType() == types::TY_LTO_BC ||
        II.getType() == types::TY_LLVM_BC) {
      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
          << getToolChain().getTripleString();
      continue;
    }

    // Currently, we only pass the input files to the linker, we do not pass
    // any libraries that may be valid only for the host.
    if (!II.isFilename())
      continue;

    // getInputFilename renames .o inputs to .cubin, which nvlink expects.
    const char *CubinF = C.addTempFile(
        C.getArgs().MakeArgString(getToolChain().getInputFilename(II)));

    CmdArgs.push_back(CubinF);
  }

  const char *Exec =
      Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
  C.addCommand(std::make_unique<Command>(
      JA, *this,
      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
                          "--options-file"},
      Exec, CmdArgs, Inputs, Output));
}
643
644 /// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
645 /// which isn't properly a linker but nonetheless performs the step of stitching
646 /// together object files from the assembler into a single blob.
647
// Constructs the CUDA device toolchain on top of the host toolchain.
// Detects the local CUDA installation, warns about unsupported versions,
// and registers program search paths for CUDA binaries and the driver's
// own tools.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                             const ToolChain &HostTC, const ArgList &Args,
                             const Action::OffloadKind OK)
    : ToolChain(D, Triple, Args), HostTC(HostTC),
      CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) {
  if (CudaInstallation.isValid()) {
    CudaInstallation.WarnIfUnsupportedVersion();
    getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
  }
  // Lookup binaries into the driver directory, this is used to
  // discover the clang-offload-bundler executable.
  getProgramPaths().push_back(getDriver().Dir);
}
661
getInputFilename(const InputInfo & Input) const662 std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
663 // Only object files are changed, for example assembly files keep their .s
664 // extensions. CUDA also continues to use .o as they don't use nvlink but
665 // fatbinary.
666 if (!(OK == Action::OFK_OpenMP && Input.getType() == types::TY_Object))
667 return ToolChain::getInputFilename(Input);
668
669 // Replace extension for object files with cubin because nvlink relies on
670 // these particular file names.
671 SmallString<256> Filename(ToolChain::getInputFilename(Input));
672 llvm::sys::path::replace_extension(Filename, "cubin");
673 return std::string(Filename.str());
674 }
675
// Adds the -cc1 options for an NVPTX device compilation: device-mode flags,
// libdevice bitcode linking, the PTX target feature matching the installed
// CUDA version, and (for OpenMP) the device runtime bitcode library.
void CudaToolChain::addClangTargetOptions(
    const llvm::opt::ArgList &DriverArgs,
    llvm::opt::ArgStringList &CC1Args,
    Action::OffloadKind DeviceOffloadingKind) const {
  HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);

  StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
  assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
  assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
          DeviceOffloadingKind == Action::OFK_Cuda) &&
         "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");

  if (DeviceOffloadingKind == Action::OFK_Cuda) {
    CC1Args.push_back("-fcuda-is-device");

    if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                           options::OPT_fno_cuda_approx_transcendentals, false))
      CC1Args.push_back("-fcuda-approx-transcendentals");
  }

  // Everything below wires up device-side libraries; skip when the user
  // opted out with -nogpulib.
  if (DriverArgs.hasArg(options::OPT_nogpulib))
    return;

  // For OpenMP assembly-only output we don't link device bitcode.
  if (DeviceOffloadingKind == Action::OFK_OpenMP &&
      DriverArgs.hasArg(options::OPT_S))
    return;

  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
  if (LibDeviceFile.empty()) {
    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
    return;
  }

  CC1Args.push_back("-mlink-builtin-bitcode");
  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));

  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();

  // New CUDA versions often introduce new instructions that are only supported
  // by new PTX version, so we need to raise PTX level to enable them in NVPTX
  // back-end.
  const char *PtxFeature = nullptr;
  switch (CudaInstallationVersion) {
#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
  case CudaVersion::CUDA_##CUDA_VER: \
    PtxFeature = "+ptx" #PTX_VER; \
    break;
    CASE_CUDA_VERSION(112, 72);
    CASE_CUDA_VERSION(111, 71);
    CASE_CUDA_VERSION(110, 70);
    CASE_CUDA_VERSION(102, 65);
    CASE_CUDA_VERSION(101, 64);
    CASE_CUDA_VERSION(100, 63);
    CASE_CUDA_VERSION(92, 61);
    CASE_CUDA_VERSION(91, 61);
    CASE_CUDA_VERSION(90, 60);
#undef CASE_CUDA_VERSION
  default:
    // Pre-9.0 (and unknown) versions fall back to the baseline PTX 4.2.
    PtxFeature = "+ptx42";
  }
  CC1Args.append({"-target-feature", PtxFeature});
  if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                         options::OPT_fno_cuda_short_ptr, false))
    CC1Args.append({"-mllvm", "--nvptx-short-ptr"});

  // NOTE(review): if CudaVersion::UNKNOWN is the smallest enumerator this
  // condition is always true — confirm whether it is intentional.
  if (CudaInstallationVersion >= CudaVersion::UNKNOWN)
    CC1Args.push_back(
        DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                 CudaVersionToString(CudaInstallationVersion)));

  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
    // OpenMP offloading requires at least CUDA 9.2.
    if (CudaInstallationVersion < CudaVersion::CUDA_92) {
      getDriver().Diag(
          diag::err_drv_omp_offload_target_cuda_version_not_support)
          << CudaVersionToString(CudaInstallationVersion);
      return;
    }

    // Select the device runtime library variant for this arch and for the
    // old vs. new OpenMP device runtime.
    std::string BitcodeSuffix;
    if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
                           options::OPT_fno_openmp_target_new_runtime, false))
      BitcodeSuffix = "new-nvptx-" + GpuArch.str();
    else
      BitcodeSuffix = "nvptx-" + GpuArch.str();

    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                       getTriple());
  }
}
765
// Returns the default denormal handling for the device compilation.
// -fgpu-flush-denormals-to-zero affects only IEEE single precision in CUDA
// device mode; everything else keeps IEEE behavior.
llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
    const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
    const llvm::fltSemantics *FPType) const {
  // Keep the && chain's evaluation order: hasFlag claims arguments, so it
  // must only run for single-precision CUDA device jobs.
  const bool IsCudaDevice = JA.getOffloadingDeviceKind() == Action::OFK_Cuda;
  if (IsCudaDevice && FPType && FPType == &llvm::APFloat::IEEEsingle() &&
      DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,
                         options::OPT_fno_gpu_flush_denormals_to_zero,
                         /*Default=*/false))
    return llvm::DenormalMode::getPreserveSign();

  assert(JA.getOffloadingDeviceKind() != Action::OFK_Host);
  return llvm::DenormalMode::getIEEE();
}
779
supportsDebugInfoOption(const llvm::opt::Arg * A) const780 bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
781 const Option &O = A->getOption();
782 return (O.matches(options::OPT_gN_Group) &&
783 !O.matches(options::OPT_gmodules)) ||
784 O.matches(options::OPT_g_Flag) ||
785 O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
786 O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
787 O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
788 O.matches(options::OPT_gdwarf_5) ||
789 O.matches(options::OPT_gcolumn_info);
790 }
791
adjustDebugInfoKind(codegenoptions::DebugInfoKind & DebugInfoKind,const ArgList & Args) const792 void CudaToolChain::adjustDebugInfoKind(
793 codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
794 switch (mustEmitDebugInfo(Args)) {
795 case DisableDebugInfo:
796 DebugInfoKind = codegenoptions::NoDebugInfo;
797 break;
798 case DebugDirectivesOnly:
799 DebugInfoKind = codegenoptions::DebugDirectivesOnly;
800 break;
801 case EmitSameDebugInfoAsHost:
802 // Use same debug info level as the host.
803 break;
804 }
805 }
806
AddCudaIncludeArgs(const ArgList & DriverArgs,ArgStringList & CC1Args) const807 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
808 ArgStringList &CC1Args) const {
809 // Check our CUDA version if we're going to include the CUDA headers.
810 if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
811 !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
812 StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
813 assert(!Arch.empty() && "Must have an explicit GPU arch.");
814 CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
815 }
816 CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
817 }
818
819 llvm::opt::DerivedArgList *
TranslateArgs(const llvm::opt::DerivedArgList & Args,StringRef BoundArch,Action::OffloadKind DeviceOffloadKind) const820 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
821 StringRef BoundArch,
822 Action::OffloadKind DeviceOffloadKind) const {
823 DerivedArgList *DAL =
824 HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
825 if (!DAL)
826 DAL = new DerivedArgList(Args.getBaseArgs());
827
828 const OptTable &Opts = getDriver().getOpts();
829
830 // For OpenMP device offloading, append derived arguments. Make sure
831 // flags are not duplicated.
832 // Also append the compute capability.
833 if (DeviceOffloadKind == Action::OFK_OpenMP) {
834 for (Arg *A : Args) {
835 bool IsDuplicate = false;
836 for (Arg *DALArg : *DAL) {
837 if (A == DALArg) {
838 IsDuplicate = true;
839 break;
840 }
841 }
842 if (!IsDuplicate)
843 DAL->append(A);
844 }
845
846 StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
847 if (Arch.empty())
848 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
849 CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
850
851 return DAL;
852 }
853
854 for (Arg *A : Args) {
855 DAL->append(A);
856 }
857
858 if (!BoundArch.empty()) {
859 DAL->eraseArg(options::OPT_march_EQ);
860 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
861 }
862 return DAL;
863 }
864
/// Create the assembler tool (ptxas wrapper) for this toolchain.
/// Caller takes ownership of the returned Tool.
Tool *CudaToolChain::buildAssembler() const {
  return new tools::NVPTX::Assembler(*this);
}
868
buildLinker() const869 Tool *CudaToolChain::buildLinker() const {
870 if (OK == Action::OFK_OpenMP)
871 return new tools::NVPTX::OpenMPLinker(*this);
872 return new tools::NVPTX::Linker(*this);
873 }
874
/// Forward warning-option setup to the host toolchain so host and device
/// compilations diagnose consistently.
void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
  HostTC.addClangWarningOptions(CC1Args);
}
878
/// Use the host's C++ standard library selection for device compilation.
ToolChain::CXXStdlibType
CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
  return HostTC.GetCXXStdlibType(Args);
}
883
/// Delegate system include paths to the host toolchain; device compilation
/// sees the same system headers as the host.
void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
  HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
}
888
/// Delegate C++ standard library include paths to the host toolchain.
void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
                                                 ArgStringList &CC1Args) const {
  HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
}
893
/// Delegate IAMCU include-path handling to the host toolchain.
void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
                                        ArgStringList &CC1Args) const {
  HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
}
898
/// Report the sanitizers this toolchain "supports" — see the note below: the
/// answer mirrors the host toolchain purely for command-line tolerance.
SanitizerMask CudaToolChain::getSupportedSanitizers() const {
  // The CudaToolChain only supports sanitizers in the sense that it allows
  // sanitizer arguments on the command line if they are supported by the host
  // toolchain. The CudaToolChain will actually ignore any command line
  // arguments for any of these "supported" sanitizers. That means that no
  // sanitization of device code is actually supported at this time.
  //
  // This behavior is necessary because the host and device toolchains
  // invocations often share the command line, so the device toolchain must
  // tolerate flags meant only for the host toolchain.
  return HostTC.getSupportedSanitizers();
}
911
/// Delegate MSVC version computation to the host toolchain (relevant when the
/// host environment is MSVC).
VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D,
                                               const ArgList &Args) const {
  return HostTC.computeMSVCVersion(D, Args);
}
916