1 //== GenericTaintChecker.cpp ----------------------------------- -*- C++ -*--=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This checker defines the attack surface for generic taint propagation.
10 //
11 // The taint information produced by it might be useful to other checkers. For
12 // example, checkers should report errors which involve tainted data more
13 // aggressively, even if the involved symbols are under constrained.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "Yaml.h"
18 #include "clang/AST/Attr.h"
19 #include "clang/Basic/Builtins.h"
20 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
21 #include "clang/StaticAnalyzer/Checkers/Taint.h"
22 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
23 #include "clang/StaticAnalyzer/Core/Checker.h"
24 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
25 #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
26 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
27 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
28 #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
29 #include "llvm/Support/YAMLTraits.h"
30 
31 #include <limits>
32 #include <memory>
33 #include <utility>
34 
35 #define DEBUG_TYPE "taint-checker"
36 
37 using namespace clang;
38 using namespace ento;
39 using namespace taint;
40 
41 using llvm::ImmutableSet;
42 
43 namespace {
44 
45 class GenericTaintChecker;
46 
47 /// Check for CWE-134: Uncontrolled Format String.
48 constexpr llvm::StringLiteral MsgUncontrolledFormatString =
49     "Untrusted data is used as a format string "
50     "(CWE-134: Uncontrolled Format String)";
51 
52 /// Check for:
53 /// CERT/STR02-C. "Sanitize data passed to complex subsystems"
54 /// CWE-78, "Failure to Sanitize Data into an OS Command"
55 constexpr llvm::StringLiteral MsgSanitizeSystemArgs =
56     "Untrusted data is passed to a system call "
57     "(CERT/STR02-C. Sanitize data passed to complex subsystems)";
58 
59 /// Check if tainted data is used as a buffer size in strn.. functions,
60 /// and allocators.
61 constexpr llvm::StringLiteral MsgTaintedBufferSize =
62     "Untrusted data is used to specify the buffer size "
63     "(CERT/STR31-C. Guarantee that storage for strings has sufficient space "
64     "for character data and the null terminator)";
65 
66 /// Check if tainted data is used as a custom sink's parameter.
67 constexpr llvm::StringLiteral MsgCustomSink =
68     "Untrusted data is passed to a user-defined sink";
69 
70 using ArgIdxTy = int;
71 using ArgVecTy = llvm::SmallVector<ArgIdxTy, 2>;
72 
73 /// Denotes the return value.
74 constexpr ArgIdxTy ReturnValueIndex{-1};
75 
76 static ArgIdxTy fromArgumentCount(unsigned Count) {
77   assert(Count <=
78              static_cast<std::size_t>(std::numeric_limits<ArgIdxTy>::max()) &&
79          "ArgIdxTy is not large enough to represent the number of arguments.");
80   return Count;
81 }
82 
83 /// Check if the region the expression evaluates to is the standard input,
84 /// and thus, is tainted.
85 /// FIXME: Move this to Taint.cpp.
86 bool isStdin(SVal Val, const ASTContext &ACtx) {
87   // FIXME: What if Val is NonParamVarRegion?
88 
89   // The region should be symbolic, we do not know it's value.
90   const auto *SymReg = dyn_cast_or_null<SymbolicRegion>(Val.getAsRegion());
91   if (!SymReg)
92     return false;
93 
94   // Get it's symbol and find the declaration region it's pointing to.
95   const auto *DeclReg =
96       dyn_cast_or_null<DeclRegion>(SymReg->getSymbol()->getOriginRegion());
97   if (!DeclReg)
98     return false;
99 
100   // This region corresponds to a declaration, find out if it's a global/extern
101   // variable named stdin with the proper type.
102   if (const auto *D = dyn_cast_or_null<VarDecl>(DeclReg->getDecl())) {
103     D = D->getCanonicalDecl();
104     // FIXME: This should look for an exact match.
105     if (D->getName().contains("stdin") && D->isExternC()) {
106       const QualType FILETy = ACtx.getFILEType().getCanonicalType();
107       const QualType Ty = D->getType().getCanonicalType();
108 
109       if (Ty->isPointerType())
110         return Ty->getPointeeType() == FILETy;
111     }
112   }
113   return false;
114 }
115 
116 SVal getPointeeOf(const CheckerContext &C, Loc LValue) {
117   const QualType ArgTy = LValue.getType(C.getASTContext());
118   if (!ArgTy->isPointerType() || !ArgTy->getPointeeType()->isVoidType())
119     return C.getState()->getSVal(LValue);
120 
121   // Do not dereference void pointers. Treat them as byte pointers instead.
122   // FIXME: we might want to consider more than just the first byte.
123   return C.getState()->getSVal(LValue, C.getASTContext().CharTy);
124 }
125 
126 /// Given a pointer/reference argument, return the value it refers to.
127 Optional<SVal> getPointeeOf(const CheckerContext &C, SVal Arg) {
128   if (auto LValue = Arg.getAs<Loc>())
129     return getPointeeOf(C, *LValue);
130   return None;
131 }
132 
133 /// Given a pointer, return the SVal of its pointee or if it is tainted,
134 /// otherwise return the pointer's SVal if tainted.
135 /// Also considers stdin as a taint source.
136 Optional<SVal> getTaintedPointeeOrPointer(const CheckerContext &C, SVal Arg) {
137   const ProgramStateRef State = C.getState();
138 
139   if (auto Pointee = getPointeeOf(C, Arg))
140     if (isTainted(State, *Pointee)) // FIXME: isTainted(...) ? Pointee : None;
141       return Pointee;
142 
143   if (isTainted(State, Arg))
144     return Arg;
145 
146   // FIXME: This should be done by the isTainted() API.
147   if (isStdin(Arg, C.getASTContext()))
148     return Arg;
149 
150   return None;
151 }
152 
153 bool isTaintedOrPointsToTainted(const Expr *E, const ProgramStateRef &State,
154                                 CheckerContext &C) {
155   return getTaintedPointeeOrPointer(C, C.getSVal(E)).has_value();
156 }
157 
158 /// ArgSet is used to describe arguments relevant for taint detection or
159 /// taint application. A discrete set of argument indexes and a variadic
160 /// argument list signified by a starting index are supported.
161 class ArgSet {
162 public:
163   ArgSet() = default;
164   ArgSet(ArgVecTy &&DiscreteArgs, Optional<ArgIdxTy> VariadicIndex = None)
165       : DiscreteArgs(std::move(DiscreteArgs)),
166         VariadicIndex(std::move(VariadicIndex)) {}
167 
168   bool contains(ArgIdxTy ArgIdx) const {
169     if (llvm::is_contained(DiscreteArgs, ArgIdx))
170       return true;
171 
172     return VariadicIndex && ArgIdx >= *VariadicIndex;
173   }
174 
175   bool isEmpty() const { return DiscreteArgs.empty() && !VariadicIndex; }
176 
177 private:
178   ArgVecTy DiscreteArgs;
179   Optional<ArgIdxTy> VariadicIndex;
180 };
181 
182 /// A struct used to specify taint propagation rules for a function.
183 ///
184 /// If any of the possible taint source arguments is tainted, all of the
185 /// destination arguments should also be tainted. If ReturnValueIndex is added
186 /// to the dst list, the return value will be tainted.
187 class GenericTaintRule {
188   /// Arguments which are taints sinks and should be checked, and a report
189   /// should be emitted if taint reaches these.
190   ArgSet SinkArgs;
191   /// Arguments which should be sanitized on function return.
192   ArgSet FilterArgs;
193   /// Arguments which can participate in taint propagationa. If any of the
194   /// arguments in PropSrcArgs is tainted, all arguments in  PropDstArgs should
195   /// be tainted.
196   ArgSet PropSrcArgs;
197   ArgSet PropDstArgs;
198 
199   /// A message that explains why the call is sensitive to taint.
200   Optional<StringRef> SinkMsg;
201 
202   GenericTaintRule() = default;
203 
204   GenericTaintRule(ArgSet &&Sink, ArgSet &&Filter, ArgSet &&Src, ArgSet &&Dst,
205                    Optional<StringRef> SinkMsg = None)
206       : SinkArgs(std::move(Sink)), FilterArgs(std::move(Filter)),
207         PropSrcArgs(std::move(Src)), PropDstArgs(std::move(Dst)),
208         SinkMsg(SinkMsg) {}
209 
210 public:
211   /// Make a rule that reports a warning if taint reaches any of \p FilterArgs
212   /// arguments.
213   static GenericTaintRule Sink(ArgSet &&SinkArgs,
214                                Optional<StringRef> Msg = None) {
215     return {std::move(SinkArgs), {}, {}, {}, Msg};
216   }
217 
218   /// Make a rule that sanitizes all FilterArgs arguments.
219   static GenericTaintRule Filter(ArgSet &&FilterArgs) {
220     return {{}, std::move(FilterArgs), {}, {}};
221   }
222 
223   /// Make a rule that unconditionally taints all Args.
224   /// If Func is provided, it must also return true for taint to propagate.
225   static GenericTaintRule Source(ArgSet &&SourceArgs) {
226     return {{}, {}, {}, std::move(SourceArgs)};
227   }
228 
229   /// Make a rule that taints all PropDstArgs if any of PropSrcArgs is tainted.
230   static GenericTaintRule Prop(ArgSet &&SrcArgs, ArgSet &&DstArgs) {
231     return {{}, {}, std::move(SrcArgs), std::move(DstArgs)};
232   }
233 
234   /// Make a rule that taints all PropDstArgs if any of PropSrcArgs is tainted.
235   static GenericTaintRule SinkProp(ArgSet &&SinkArgs, ArgSet &&SrcArgs,
236                                    ArgSet &&DstArgs,
237                                    Optional<StringRef> Msg = None) {
238     return {
239         std::move(SinkArgs), {}, std::move(SrcArgs), std::move(DstArgs), Msg};
240   }
241 
242   /// Process a function which could either be a taint source, a taint sink, a
243   /// taint filter or a taint propagator.
244   void process(const GenericTaintChecker &Checker, const CallEvent &Call,
245                CheckerContext &C) const;
246 
247   /// Handles the resolution of indexes of type ArgIdxTy to Expr*-s.
248   static const Expr *GetArgExpr(ArgIdxTy ArgIdx, const CallEvent &Call) {
249     return ArgIdx == ReturnValueIndex ? Call.getOriginExpr()
250                                       : Call.getArgExpr(ArgIdx);
251   };
252 
253   /// Functions for custom taintedness propagation.
254   static bool UntrustedEnv(CheckerContext &C);
255 };
256 
257 using RuleLookupTy = CallDescriptionMap<GenericTaintRule>;
258 
259 /// Used to parse the configuration file.
260 struct TaintConfiguration {
261   using NameScopeArgs = std::tuple<std::string, std::string, ArgVecTy>;
262   enum class VariadicType { None, Src, Dst };
263 
264   struct Common {
265     std::string Name;
266     std::string Scope;
267   };
268 
269   struct Sink : Common {
270     ArgVecTy SinkArgs;
271   };
272 
273   struct Filter : Common {
274     ArgVecTy FilterArgs;
275   };
276 
277   struct Propagation : Common {
278     ArgVecTy SrcArgs;
279     ArgVecTy DstArgs;
280     VariadicType VarType;
281     ArgIdxTy VarIndex;
282   };
283 
284   std::vector<Propagation> Propagations;
285   std::vector<Filter> Filters;
286   std::vector<Sink> Sinks;
287 
288   TaintConfiguration() = default;
289   TaintConfiguration(const TaintConfiguration &) = default;
290   TaintConfiguration(TaintConfiguration &&) = default;
291   TaintConfiguration &operator=(const TaintConfiguration &) = default;
292   TaintConfiguration &operator=(TaintConfiguration &&) = default;
293 };
294 
295 struct GenericTaintRuleParser {
296   GenericTaintRuleParser(CheckerManager &Mgr) : Mgr(Mgr) {}
297   /// Container type used to gather call identification objects grouped into
298   /// pairs with their corresponding taint rules. It is temporary as it is used
299   /// to finally initialize RuleLookupTy, which is considered to be immutable.
300   using RulesContTy = std::vector<std::pair<CallDescription, GenericTaintRule>>;
301   RulesContTy parseConfiguration(const std::string &Option,
302                                  TaintConfiguration &&Config) const;
303 
304 private:
305   using NamePartsTy = llvm::SmallVector<SmallString<32>, 2>;
306 
307   /// Validate part of the configuration, which contains a list of argument
308   /// indexes.
309   void validateArgVector(const std::string &Option, const ArgVecTy &Args) const;
310 
311   template <typename Config> static NamePartsTy parseNameParts(const Config &C);
312 
313   // Takes the config and creates a CallDescription for it and associates a Rule
314   // with that.
315   template <typename Config>
316   static void consumeRulesFromConfig(const Config &C, GenericTaintRule &&Rule,
317                                      RulesContTy &Rules);
318 
319   void parseConfig(const std::string &Option, TaintConfiguration::Sink &&P,
320                    RulesContTy &Rules) const;
321   void parseConfig(const std::string &Option, TaintConfiguration::Filter &&P,
322                    RulesContTy &Rules) const;
323   void parseConfig(const std::string &Option,
324                    TaintConfiguration::Propagation &&P,
325                    RulesContTy &Rules) const;
326 
327   CheckerManager &Mgr;
328 };
329 
330 class GenericTaintChecker : public Checker<check::PreCall, check::PostCall> {
331 public:
332   void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
333   void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
334 
335   void printState(raw_ostream &Out, ProgramStateRef State, const char *NL,
336                   const char *Sep) const override;
337 
338   /// Generate a report if the expression is tainted or points to tainted data.
339   bool generateReportIfTainted(const Expr *E, StringRef Msg,
340                                CheckerContext &C) const;
341 
342 private:
343   const BugType BT{this, "Use of Untrusted Data", "Untrusted Data"};
344 
345   bool checkUncontrolledFormatString(const CallEvent &Call,
346                                      CheckerContext &C) const;
347 
348   void taintUnsafeSocketProtocol(const CallEvent &Call,
349                                  CheckerContext &C) const;
350 
351   /// Default taint rules are initilized with the help of a CheckerContext to
352   /// access the names of built-in functions like memcpy.
353   void initTaintRules(CheckerContext &C) const;
354 
355   /// CallDescription currently cannot restrict matches to the global namespace
356   /// only, which is why multiple CallDescriptionMaps are used, as we want to
357   /// disambiguate global C functions from functions inside user-defined
358   /// namespaces.
359   // TODO: Remove separation to simplify matching logic once CallDescriptions
360   // are more expressive.
361 
362   mutable Optional<RuleLookupTy> StaticTaintRules;
363   mutable Optional<RuleLookupTy> DynamicTaintRules;
364 };
365 } // end of anonymous namespace
366 
367 /// YAML serialization mapping.
368 LLVM_YAML_IS_SEQUENCE_VECTOR(TaintConfiguration::Sink)
369 LLVM_YAML_IS_SEQUENCE_VECTOR(TaintConfiguration::Filter)
370 LLVM_YAML_IS_SEQUENCE_VECTOR(TaintConfiguration::Propagation)
371 
372 namespace llvm {
373 namespace yaml {
374 template <> struct MappingTraits<TaintConfiguration> {
375   static void mapping(IO &IO, TaintConfiguration &Config) {
376     IO.mapOptional("Propagations", Config.Propagations);
377     IO.mapOptional("Filters", Config.Filters);
378     IO.mapOptional("Sinks", Config.Sinks);
379   }
380 };
381 
382 template <> struct MappingTraits<TaintConfiguration::Sink> {
383   static void mapping(IO &IO, TaintConfiguration::Sink &Sink) {
384     IO.mapRequired("Name", Sink.Name);
385     IO.mapOptional("Scope", Sink.Scope);
386     IO.mapRequired("Args", Sink.SinkArgs);
387   }
388 };
389 
390 template <> struct MappingTraits<TaintConfiguration::Filter> {
391   static void mapping(IO &IO, TaintConfiguration::Filter &Filter) {
392     IO.mapRequired("Name", Filter.Name);
393     IO.mapOptional("Scope", Filter.Scope);
394     IO.mapRequired("Args", Filter.FilterArgs);
395   }
396 };
397 
398 template <> struct MappingTraits<TaintConfiguration::Propagation> {
399   static void mapping(IO &IO, TaintConfiguration::Propagation &Propagation) {
400     IO.mapRequired("Name", Propagation.Name);
401     IO.mapOptional("Scope", Propagation.Scope);
402     IO.mapOptional("SrcArgs", Propagation.SrcArgs);
403     IO.mapOptional("DstArgs", Propagation.DstArgs);
404     IO.mapOptional("VariadicType", Propagation.VarType);
405     IO.mapOptional("VariadicIndex", Propagation.VarIndex);
406   }
407 };
408 
409 template <> struct ScalarEnumerationTraits<TaintConfiguration::VariadicType> {
410   static void enumeration(IO &IO, TaintConfiguration::VariadicType &Value) {
411     IO.enumCase(Value, "None", TaintConfiguration::VariadicType::None);
412     IO.enumCase(Value, "Src", TaintConfiguration::VariadicType::Src);
413     IO.enumCase(Value, "Dst", TaintConfiguration::VariadicType::Dst);
414   }
415 };
416 } // namespace yaml
417 } // namespace llvm
418 
419 /// A set which is used to pass information from call pre-visit instruction
420 /// to the call post-visit. The values are signed integers, which are either
421 /// ReturnValueIndex, or indexes of the pointer/reference argument, which
422 /// points to data, which should be tainted on return.
423 REGISTER_MAP_WITH_PROGRAMSTATE(TaintArgsOnPostVisit, const LocationContext *,
424                                ImmutableSet<ArgIdxTy>)
425 REGISTER_SET_FACTORY_WITH_PROGRAMSTATE(ArgIdxFactory, ArgIdxTy)
426 
427 void GenericTaintRuleParser::validateArgVector(const std::string &Option,
428                                                const ArgVecTy &Args) const {
429   for (ArgIdxTy Arg : Args) {
430     if (Arg < ReturnValueIndex) {
431       Mgr.reportInvalidCheckerOptionValue(
432           Mgr.getChecker<GenericTaintChecker>(), Option,
433           "an argument number for propagation rules greater or equal to -1");
434     }
435   }
436 }
437 
438 template <typename Config>
439 GenericTaintRuleParser::NamePartsTy
440 GenericTaintRuleParser::parseNameParts(const Config &C) {
441   NamePartsTy NameParts;
442   if (!C.Scope.empty()) {
443     // If the Scope argument contains multiple "::" parts, those are considered
444     // namespace identifiers.
445     llvm::SmallVector<StringRef, 2> NSParts;
446     StringRef{C.Scope}.split(NSParts, "::", /*MaxSplit*/ -1,
447                              /*KeepEmpty*/ false);
448     NameParts.append(NSParts.begin(), NSParts.end());
449   }
450   NameParts.emplace_back(C.Name);
451   return NameParts;
452 }
453 
454 template <typename Config>
455 void GenericTaintRuleParser::consumeRulesFromConfig(const Config &C,
456                                                     GenericTaintRule &&Rule,
457                                                     RulesContTy &Rules) {
458   NamePartsTy NameParts = parseNameParts(C);
459   llvm::SmallVector<const char *, 2> CallDescParts{NameParts.size()};
460   llvm::transform(NameParts, CallDescParts.begin(),
461                   [](SmallString<32> &S) { return S.c_str(); });
462   Rules.emplace_back(CallDescription(CallDescParts), std::move(Rule));
463 }
464 
465 void GenericTaintRuleParser::parseConfig(const std::string &Option,
466                                          TaintConfiguration::Sink &&S,
467                                          RulesContTy &Rules) const {
468   validateArgVector(Option, S.SinkArgs);
469   consumeRulesFromConfig(S, GenericTaintRule::Sink(std::move(S.SinkArgs)),
470                          Rules);
471 }
472 
473 void GenericTaintRuleParser::parseConfig(const std::string &Option,
474                                          TaintConfiguration::Filter &&S,
475                                          RulesContTy &Rules) const {
476   validateArgVector(Option, S.FilterArgs);
477   consumeRulesFromConfig(S, GenericTaintRule::Filter(std::move(S.FilterArgs)),
478                          Rules);
479 }
480 
481 void GenericTaintRuleParser::parseConfig(const std::string &Option,
482                                          TaintConfiguration::Propagation &&P,
483                                          RulesContTy &Rules) const {
484   validateArgVector(Option, P.SrcArgs);
485   validateArgVector(Option, P.DstArgs);
486   bool IsSrcVariadic = P.VarType == TaintConfiguration::VariadicType::Src;
487   bool IsDstVariadic = P.VarType == TaintConfiguration::VariadicType::Dst;
488   Optional<ArgIdxTy> JustVarIndex = P.VarIndex;
489 
490   ArgSet SrcDesc(std::move(P.SrcArgs), IsSrcVariadic ? JustVarIndex : None);
491   ArgSet DstDesc(std::move(P.DstArgs), IsDstVariadic ? JustVarIndex : None);
492 
493   consumeRulesFromConfig(
494       P, GenericTaintRule::Prop(std::move(SrcDesc), std::move(DstDesc)), Rules);
495 }
496 
497 GenericTaintRuleParser::RulesContTy
498 GenericTaintRuleParser::parseConfiguration(const std::string &Option,
499                                            TaintConfiguration &&Config) const {
500 
501   RulesContTy Rules;
502 
503   for (auto &F : Config.Filters)
504     parseConfig(Option, std::move(F), Rules);
505 
506   for (auto &S : Config.Sinks)
507     parseConfig(Option, std::move(S), Rules);
508 
509   for (auto &P : Config.Propagations)
510     parseConfig(Option, std::move(P), Rules);
511 
512   return Rules;
513 }
514 
515 void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
516   // Check for exact name match for functions without builtin substitutes.
517   // Use qualified name, because these are C functions without namespace.
518 
519   if (StaticTaintRules || DynamicTaintRules)
520     return;
521 
522   using RulesConstructionTy =
523       std::vector<std::pair<CallDescription, GenericTaintRule>>;
524   using TR = GenericTaintRule;
525 
526   const Builtin::Context &BI = C.getASTContext().BuiltinInfo;
527 
528   RulesConstructionTy GlobalCRules{
529       // Sources
530       {{"fdopen"}, TR::Source({{ReturnValueIndex}})},
531       {{"fopen"}, TR::Source({{ReturnValueIndex}})},
532       {{"freopen"}, TR::Source({{ReturnValueIndex}})},
533       {{"getch"}, TR::Source({{ReturnValueIndex}})},
534       {{"getchar"}, TR::Source({{ReturnValueIndex}})},
535       {{"getchar_unlocked"}, TR::Source({{ReturnValueIndex}})},
536       {{"gets"}, TR::Source({{0}, ReturnValueIndex})},
537       {{"gets_s"}, TR::Source({{0}, ReturnValueIndex})},
538       {{"scanf"}, TR::Source({{}, 1})},
539       {{"scanf_s"}, TR::Source({{}, {1}})},
540       {{"wgetch"}, TR::Source({{}, ReturnValueIndex})},
541       // Sometimes the line between taint sources and propagators is blurry.
542       // _IO_getc is choosen to be a source, but could also be a propagator.
543       // This way it is simpler, as modeling it as a propagator would require
544       // to model the possible sources of _IO_FILE * values, which the _IO_getc
545       // function takes as parameters.
546       {{"_IO_getc"}, TR::Source({{ReturnValueIndex}})},
547       {{"getcwd"}, TR::Source({{0, ReturnValueIndex}})},
548       {{"getwd"}, TR::Source({{0, ReturnValueIndex}})},
549       {{"readlink"}, TR::Source({{1, ReturnValueIndex}})},
550       {{"readlinkat"}, TR::Source({{2, ReturnValueIndex}})},
551       {{"get_current_dir_name"}, TR::Source({{ReturnValueIndex}})},
552       {{"gethostname"}, TR::Source({{0}})},
553       {{"getnameinfo"}, TR::Source({{2, 4}})},
554       {{"getseuserbyname"}, TR::Source({{1, 2}})},
555       {{"getgroups"}, TR::Source({{1, ReturnValueIndex}})},
556       {{"getlogin"}, TR::Source({{ReturnValueIndex}})},
557       {{"getlogin_r"}, TR::Source({{0}})},
558 
559       // Props
560       {{"atoi"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
561       {{"atol"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
562       {{"atoll"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
563       {{"fgetc"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
564       {{"fgetln"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
565       {{"fgets"}, TR::Prop({{2}}, {{0, ReturnValueIndex}})},
566       {{"fscanf"}, TR::Prop({{0}}, {{}, 2})},
567       {{"fscanf_s"}, TR::Prop({{0}}, {{}, {2}})},
568       {{"sscanf"}, TR::Prop({{0}}, {{}, 2})},
569 
570       {{"getc"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
571       {{"getc_unlocked"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
572       {{"getdelim"}, TR::Prop({{3}}, {{0}})},
573       {{"getline"}, TR::Prop({{2}}, {{0}})},
574       {{"getw"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
575       {{"pread"}, TR::Prop({{0, 1, 2, 3}}, {{1, ReturnValueIndex}})},
576       {{"read"}, TR::Prop({{0, 2}}, {{1, ReturnValueIndex}})},
577       {{"strchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
578       {{"strrchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
579       {{"tolower"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
580       {{"toupper"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
581       {{"fread"}, TR::Prop({{3}}, {{0, ReturnValueIndex}})},
582       {{"recv"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
583       {{"recvfrom"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
584 
585       {{"ttyname"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
586       {{"ttyname_r"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
587 
588       {{"basename"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
589       {{"dirname"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
590       {{"fnmatch"}, TR::Prop({{1}}, {{ReturnValueIndex}})},
591       {{"memchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
592       {{"memrchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
593       {{"rawmemchr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
594 
595       {{"mbtowc"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
596       {{"wctomb"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
597       {{"wcwidth"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
598 
599       {{"memcmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
600       {{"memcpy"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
601       {{"memmove"}, TR::Prop({{1}}, {{0, ReturnValueIndex}})},
602       // If memmem was called with a tainted needle and the search was
603       // successful, that would mean that the value pointed by the return value
604       // has the same content as the needle. If we choose to go by the policy of
605       // content equivalence implies taintedness equivalence, that would mean
606       // haystack should be considered a propagation source argument.
607       {{"memmem"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
608 
609       // The comment for memmem above also applies to strstr.
610       {{"strstr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
611       {{"strcasestr"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
612 
613       {{"strchrnul"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
614 
615       {{"index"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
616       {{"rindex"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
617 
618       // FIXME: In case of arrays, only the first element of the array gets
619       // tainted.
620       {{"qsort"}, TR::Prop({{0}}, {{0}})},
621       {{"qsort_r"}, TR::Prop({{0}}, {{0}})},
622 
623       {{"strcmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
624       {{"strcasecmp"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
625       {{"strncmp"}, TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})},
626       {{"strncasecmp"}, TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})},
627       {{"strspn"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
628       {{"strcspn"}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})},
629       {{"strpbrk"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
630       {{"strndup"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
631       {{"strndupa"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
632       {{"strlen"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
633       {{"strnlen"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
634       {{"strtol"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
635       {{"strtoll"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
636       {{"strtoul"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
637       {{"strtoull"}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
638 
639       {{"isalnum"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
640       {{"isalpha"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
641       {{"isascii"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
642       {{"isblank"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
643       {{"iscntrl"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
644       {{"isdigit"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
645       {{"isgraph"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
646       {{"islower"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
647       {{"isprint"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
648       {{"ispunct"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
649       {{"isspace"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
650       {{"isupper"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
651       {{"isxdigit"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
652 
653       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrncat)}},
654        TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})},
655       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrlcpy)}},
656        TR::Prop({{1, 2}}, {{0}})},
657       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrlcat)}},
658        TR::Prop({{1, 2}}, {{0}})},
659       {{CDF_MaybeBuiltin, {"snprintf"}},
660        TR::Prop({{1}, 3}, {{0, ReturnValueIndex}})},
661       {{CDF_MaybeBuiltin, {"sprintf"}},
662        TR::Prop({{1}, 2}, {{0, ReturnValueIndex}})},
663       {{CDF_MaybeBuiltin, {"strcpy"}},
664        TR::Prop({{1}}, {{0, ReturnValueIndex}})},
665       {{CDF_MaybeBuiltin, {"stpcpy"}},
666        TR::Prop({{1}}, {{0, ReturnValueIndex}})},
667       {{CDF_MaybeBuiltin, {"strcat"}},
668        TR::Prop({{1}}, {{0, ReturnValueIndex}})},
669       {{CDF_MaybeBuiltin, {"strdup"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
670       {{CDF_MaybeBuiltin, {"strdupa"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
671       {{CDF_MaybeBuiltin, {"wcsdup"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
672 
673       // Sinks
674       {{"system"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
675       {{"popen"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
676       {{"execl"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
677       {{"execle"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
678       {{"execlp"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
679       {{"execvp"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
680       {{"execvP"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
681       {{"execve"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
682       {{"dlopen"}, TR::Sink({{0}}, MsgSanitizeSystemArgs)},
683       {{CDF_MaybeBuiltin, {"malloc"}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
684       {{CDF_MaybeBuiltin, {"calloc"}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
685       {{CDF_MaybeBuiltin, {"alloca"}}, TR::Sink({{0}}, MsgTaintedBufferSize)},
686       {{CDF_MaybeBuiltin, {"memccpy"}}, TR::Sink({{3}}, MsgTaintedBufferSize)},
687       {{CDF_MaybeBuiltin, {"realloc"}}, TR::Sink({{1}}, MsgTaintedBufferSize)},
688       {{{"setproctitle"}}, TR::Sink({{0}, 1}, MsgUncontrolledFormatString)},
689       {{{"setproctitle_fast"}},
690        TR::Sink({{0}, 1}, MsgUncontrolledFormatString)},
691 
692       // SinkProps
693       {{CDF_MaybeBuiltin, BI.getName(Builtin::BImemcpy)},
694        TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
695                     MsgTaintedBufferSize)},
696       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BImemmove)}},
697        TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
698                     MsgTaintedBufferSize)},
699       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrncpy)}},
700        TR::SinkProp({{2}}, {{1, 2}}, {{0, ReturnValueIndex}},
701                     MsgTaintedBufferSize)},
702       {{CDF_MaybeBuiltin, {BI.getName(Builtin::BIstrndup)}},
703        TR::SinkProp({{1}}, {{0, 1}}, {{ReturnValueIndex}},
704                     MsgTaintedBufferSize)},
705       {{CDF_MaybeBuiltin, {"bcopy"}},
706        TR::SinkProp({{2}}, {{0, 2}}, {{1}}, MsgTaintedBufferSize)}};
707 
708   // `getenv` returns taint only in untrusted environments.
709   if (TR::UntrustedEnv(C)) {
710     // void setproctitle_init(int argc, char *argv[], char *envp[])
711     GlobalCRules.push_back(
712         {{{"setproctitle_init"}}, TR::Sink({{1, 2}}, MsgCustomSink)});
713     GlobalCRules.push_back({{"getenv"}, TR::Source({{ReturnValueIndex}})});
714   }
715 
716   StaticTaintRules.emplace(std::make_move_iterator(GlobalCRules.begin()),
717                            std::make_move_iterator(GlobalCRules.end()));
718 
719   // User-provided taint configuration.
720   CheckerManager *Mgr = C.getAnalysisManager().getCheckerManager();
721   assert(Mgr);
722   GenericTaintRuleParser ConfigParser{*Mgr};
723   std::string Option{"Config"};
724   StringRef ConfigFile =
725       Mgr->getAnalyzerOptions().getCheckerStringOption(this, Option);
726   llvm::Optional<TaintConfiguration> Config =
727       getConfiguration<TaintConfiguration>(*Mgr, this, Option, ConfigFile);
728   if (!Config) {
729     // We don't have external taint config, no parsing required.
730     DynamicTaintRules = RuleLookupTy{};
731     return;
732   }
733 
734   GenericTaintRuleParser::RulesContTy Rules{
735       ConfigParser.parseConfiguration(Option, std::move(*Config))};
736 
737   DynamicTaintRules.emplace(std::make_move_iterator(Rules.begin()),
738                             std::make_move_iterator(Rules.end()));
739 }
740 
741 void GenericTaintChecker::checkPreCall(const CallEvent &Call,
742                                        CheckerContext &C) const {
743   initTaintRules(C);
744 
745   // FIXME: this should be much simpler.
746   if (const auto *Rule =
747           Call.isGlobalCFunction() ? StaticTaintRules->lookup(Call) : nullptr)
748     Rule->process(*this, Call, C);
749   else if (const auto *Rule = DynamicTaintRules->lookup(Call))
750     Rule->process(*this, Call, C);
751 
752   // FIXME: These edge cases are to be eliminated from here eventually.
753   //
754   // Additional check that is not supported by CallDescription.
755   // TODO: Make CallDescription be able to match attributes such as printf-like
756   // arguments.
757   checkUncontrolledFormatString(Call, C);
758 
759   // TODO: Modeling sockets should be done in a specific checker.
760   // Socket is a source, which taints the return value.
761   taintUnsafeSocketProtocol(Call, C);
762 }
763 
764 void GenericTaintChecker::checkPostCall(const CallEvent &Call,
765                                         CheckerContext &C) const {
766   // Set the marked values as tainted. The return value only accessible from
767   // checkPostStmt.
768   ProgramStateRef State = C.getState();
769   const StackFrameContext *CurrentFrame = C.getStackFrame();
770 
771   // Depending on what was tainted at pre-visit, we determined a set of
772   // arguments which should be tainted after the function returns. These are
773   // stored in the state as TaintArgsOnPostVisit set.
774   TaintArgsOnPostVisitTy TaintArgsMap = State->get<TaintArgsOnPostVisit>();
775 
776   const ImmutableSet<ArgIdxTy> *TaintArgs = TaintArgsMap.lookup(CurrentFrame);
777   if (!TaintArgs)
778     return;
779   assert(!TaintArgs->isEmpty());
780 
781   LLVM_DEBUG(for (ArgIdxTy I
782                   : *TaintArgs) {
783     llvm::dbgs() << "PostCall<";
784     Call.dump(llvm::dbgs());
785     llvm::dbgs() << "> actually wants to taint arg index: " << I << '\n';
786   });
787 
788   for (ArgIdxTy ArgNum : *TaintArgs) {
789     // Special handling for the tainted return value.
790     if (ArgNum == ReturnValueIndex) {
791       State = addTaint(State, Call.getReturnValue());
792       continue;
793     }
794 
795     // The arguments are pointer arguments. The data they are pointing at is
796     // tainted after the call.
797     if (auto V = getPointeeOf(C, Call.getArgSVal(ArgNum)))
798       State = addTaint(State, *V);
799   }
800 
801   // Clear up the taint info from the state.
802   State = State->remove<TaintArgsOnPostVisit>(CurrentFrame);
803   C.addTransition(State);
804 }
805 
806 void GenericTaintChecker::printState(raw_ostream &Out, ProgramStateRef State,
807                                      const char *NL, const char *Sep) const {
808   printTaint(State, Out, NL, Sep);
809 }
810 
811 void GenericTaintRule::process(const GenericTaintChecker &Checker,
812                                const CallEvent &Call, CheckerContext &C) const {
813   ProgramStateRef State = C.getState();
814   const ArgIdxTy CallNumArgs = fromArgumentCount(Call.getNumArgs());
815 
816   /// Iterate every call argument, and get their corresponding Expr and SVal.
817   const auto ForEachCallArg = [&C, &Call, CallNumArgs](auto &&Fun) {
818     for (ArgIdxTy I = ReturnValueIndex; I < CallNumArgs; ++I) {
819       const Expr *E = GetArgExpr(I, Call);
820       Fun(I, E, C.getSVal(E));
821     }
822   };
823 
824   /// Check for taint sinks.
825   ForEachCallArg([this, &Checker, &C, &State](ArgIdxTy I, const Expr *E, SVal) {
826     if (SinkArgs.contains(I) && isTaintedOrPointsToTainted(E, State, C))
827       Checker.generateReportIfTainted(E, SinkMsg.value_or(MsgCustomSink), C);
828   });
829 
830   /// Check for taint filters.
831   ForEachCallArg([this, &C, &State](ArgIdxTy I, const Expr *E, SVal S) {
832     if (FilterArgs.contains(I)) {
833       State = removeTaint(State, S);
834       if (auto P = getPointeeOf(C, S))
835         State = removeTaint(State, *P);
836     }
837   });
838 
839   /// Check for taint propagation sources.
840   /// A rule is relevant if PropSrcArgs is empty, or if any of its signified
841   /// args are tainted in context of the current CallEvent.
842   bool IsMatching = PropSrcArgs.isEmpty();
843   ForEachCallArg(
844       [this, &C, &IsMatching, &State](ArgIdxTy I, const Expr *E, SVal) {
845         IsMatching = IsMatching || (PropSrcArgs.contains(I) &&
846                                     isTaintedOrPointsToTainted(E, State, C));
847       });
848 
849   if (!IsMatching)
850     return;
851 
852   const auto WouldEscape = [](SVal V, QualType Ty) -> bool {
853     if (!isa<Loc>(V))
854       return false;
855 
856     const bool IsNonConstRef = Ty->isReferenceType() && !Ty.isConstQualified();
857     const bool IsNonConstPtr =
858         Ty->isPointerType() && !Ty->getPointeeType().isConstQualified();
859 
860     return IsNonConstRef || IsNonConstPtr;
861   };
862 
863   /// Propagate taint where it is necessary.
864   auto &F = State->getStateManager().get_context<ArgIdxFactory>();
865   ImmutableSet<ArgIdxTy> Result = F.getEmptySet();
866   ForEachCallArg(
867       [&](ArgIdxTy I, const Expr *E, SVal V) {
868         if (PropDstArgs.contains(I)) {
869           LLVM_DEBUG(llvm::dbgs() << "PreCall<"; Call.dump(llvm::dbgs());
870                      llvm::dbgs()
871                      << "> prepares tainting arg index: " << I << '\n';);
872           Result = F.add(Result, I);
873         }
874 
875         // TODO: We should traverse all reachable memory regions via the
876         // escaping parameter. Instead of doing that we simply mark only the
877         // referred memory region as tainted.
878         if (WouldEscape(V, E->getType())) {
879           LLVM_DEBUG(if (!Result.contains(I)) {
880             llvm::dbgs() << "PreCall<";
881             Call.dump(llvm::dbgs());
882             llvm::dbgs() << "> prepares tainting arg index: " << I << '\n';
883           });
884           Result = F.add(Result, I);
885         }
886       });
887 
888   if (!Result.isEmpty())
889     State = State->set<TaintArgsOnPostVisit>(C.getStackFrame(), Result);
890   C.addTransition(State);
891 }
892 
893 bool GenericTaintRule::UntrustedEnv(CheckerContext &C) {
894   return !C.getAnalysisManager()
895               .getAnalyzerOptions()
896               .ShouldAssumeControlledEnvironment;
897 }
898 
899 bool GenericTaintChecker::generateReportIfTainted(const Expr *E, StringRef Msg,
900                                                   CheckerContext &C) const {
901   assert(E);
902   Optional<SVal> TaintedSVal{getTaintedPointeeOrPointer(C, C.getSVal(E))};
903 
904   if (!TaintedSVal)
905     return false;
906 
907   // Generate diagnostic.
908   if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
909     auto report = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
910     report->addRange(E->getSourceRange());
911     report->addVisitor(std::make_unique<TaintBugVisitor>(*TaintedSVal));
912     C.emitReport(std::move(report));
913     return true;
914   }
915   return false;
916 }
917 
/// TODO: Move the checking of printf format attributes and the socket protocol
/// whitelisting out of GenericTaintChecker. This means removing the following
/// functions:
/// getPrintfFormatArgumentNum,
/// GenericTaintChecker::checkUncontrolledFormatString,
/// GenericTaintChecker::taintUnsafeSocketProtocol
923 
924 static bool getPrintfFormatArgumentNum(const CallEvent &Call,
925                                        const CheckerContext &C,
926                                        ArgIdxTy &ArgNum) {
927   // Find if the function contains a format string argument.
928   // Handles: fprintf, printf, sprintf, snprintf, vfprintf, vprintf, vsprintf,
929   // vsnprintf, syslog, custom annotated functions.
930   const Decl *CallDecl = Call.getDecl();
931   if (!CallDecl)
932     return false;
933   const FunctionDecl *FDecl = CallDecl->getAsFunction();
934   if (!FDecl)
935     return false;
936 
937   const ArgIdxTy CallNumArgs = fromArgumentCount(Call.getNumArgs());
938 
939   for (const auto *Format : FDecl->specific_attrs<FormatAttr>()) {
940     ArgNum = Format->getFormatIdx() - 1;
941     if ((Format->getType()->getName() == "printf") && CallNumArgs > ArgNum)
942       return true;
943   }
944 
945   return false;
946 }
947 
948 bool GenericTaintChecker::checkUncontrolledFormatString(
949     const CallEvent &Call, CheckerContext &C) const {
950   // Check if the function contains a format string argument.
951   ArgIdxTy ArgNum = 0;
952   if (!getPrintfFormatArgumentNum(Call, C, ArgNum))
953     return false;
954 
955   // If either the format string content or the pointer itself are tainted,
956   // warn.
957   return generateReportIfTainted(Call.getArgExpr(ArgNum),
958                                  MsgUncontrolledFormatString, C);
959 }
960 
961 void GenericTaintChecker::taintUnsafeSocketProtocol(const CallEvent &Call,
962                                                     CheckerContext &C) const {
963   if (Call.getNumArgs() < 1)
964     return;
965   const IdentifierInfo *ID = Call.getCalleeIdentifier();
966   if (!ID)
967     return;
968   if (!ID->getName().equals("socket"))
969     return;
970 
971   SourceLocation DomLoc = Call.getArgExpr(0)->getExprLoc();
972   StringRef DomName = C.getMacroNameOrSpelling(DomLoc);
973   // Allow internal communication protocols.
974   bool SafeProtocol = DomName.equals("AF_SYSTEM") ||
975                       DomName.equals("AF_LOCAL") || DomName.equals("AF_UNIX") ||
976                       DomName.equals("AF_RESERVED_36");
977   if (SafeProtocol)
978     return;
979 
980   ProgramStateRef State = C.getState();
981   auto &F = State->getStateManager().get_context<ArgIdxFactory>();
982   ImmutableSet<ArgIdxTy> Result = F.add(F.getEmptySet(), ReturnValueIndex);
983   State = State->set<TaintArgsOnPostVisit>(C.getStackFrame(), Result);
984   C.addTransition(State);
985 }
986 
987 /// Checker registration
988 void ento::registerGenericTaintChecker(CheckerManager &Mgr) {
989   Mgr.registerChecker<GenericTaintChecker>();
990 }
991 
992 bool ento::shouldRegisterGenericTaintChecker(const CheckerManager &mgr) {
993   return true;
994 }
995