1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
15 #include "clang/AST/ParentMapContext.h"
16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17 #include "clang/StaticAnalyzer/Checkers/Taint.h"
18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19 #include "clang/StaticAnalyzer/Core/Checker.h"
20 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29 
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34 
35 namespace {
36 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
37 
38 struct Messages {
39   std::string Short, Full;
40 };
41 
42 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
43 // instead of `PreStmt` because the current implementation passes the whole
44 // expression to `CheckerContext::getSVal()` which only works after the
45 // symbolic evaluation of the expression. (To turn them into `PreStmt`
46 // callbacks, we'd need to duplicate the logic that evaluates these
47 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
48 // defined as `PostStmt` for the sake of consistency with the other callbacks.
49 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
50                                            check::PostStmt<UnaryOperator>,
51                                            check::PostStmt<MemberExpr>> {
52   BugType BT{this, "Out-of-bound access"};
53   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
54 
55   void performCheck(const Expr *E, CheckerContext &C) const;
56 
57   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
58                  NonLoc Offset, Messages Msgs) const;
59 
60   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
61 
62   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
63 
64 public:
checkPostStmt(const ArraySubscriptExpr * E,CheckerContext & C) const65   void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
66     performCheck(E, C);
67   }
checkPostStmt(const UnaryOperator * E,CheckerContext & C) const68   void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
69     if (E->getOpcode() == UO_Deref)
70       performCheck(E, C);
71   }
checkPostStmt(const MemberExpr * E,CheckerContext & C) const72   void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
73     if (E->isArrow())
74       performCheck(E->getBase(), C);
75   }
76 };
77 
78 } // anonymous namespace
79 
80 /// For a given Location that can be represented as a symbolic expression
81 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
82 /// Arr and the distance of Location from the beginning of Arr (expressed in a
83 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
84 /// cannot be determined.
85 static std::optional<std::pair<const SubRegion *, NonLoc>>
computeOffset(ProgramStateRef State,SValBuilder & SVB,SVal Location)86 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
87   QualType T = SVB.getArrayIndexType();
88   auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
89     // We will use this utility to add and multiply values.
90     return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
91   };
92 
93   const SubRegion *OwnerRegion = nullptr;
94   std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
95 
96   const ElementRegion *CurRegion =
97       dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
98 
99   while (CurRegion) {
100     const auto Index = CurRegion->getIndex().getAs<NonLoc>();
101     if (!Index)
102       return std::nullopt;
103 
104     QualType ElemType = CurRegion->getElementType();
105 
106     // FIXME: The following early return was presumably added to safeguard the
107     // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
108     // it seems that `ElemType` cannot be incomplete at this point.
109     if (ElemType->isIncompleteType())
110       return std::nullopt;
111 
112     // Calculate Delta = Index * sizeof(ElemType).
113     NonLoc Size = SVB.makeArrayIndex(
114         SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
115     auto Delta = EvalBinOp(BO_Mul, *Index, Size);
116     if (!Delta)
117       return std::nullopt;
118 
119     // Perform Offset += Delta.
120     Offset = EvalBinOp(BO_Add, *Offset, *Delta);
121     if (!Offset)
122       return std::nullopt;
123 
124     OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
125     // When this is just another ElementRegion layer, we need to continue the
126     // offset calculations:
127     CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
128   }
129 
130   if (OwnerRegion)
131     return std::make_pair(OwnerRegion, *Offset);
132 
133   return std::nullopt;
134 }
135 
136 // TODO: once the constraint manager is smart enough to handle non simplified
137 // symbolic expressions remove this function. Note that this can not be used in
138 // the constraint manager as is, since this does not handle overflows. It is
139 // safe to assume, however, that memory offsets will not overflow.
140 // NOTE: callers of this function need to be aware of the effects of overflows
141 // and signed<->unsigned conversions!
142 static std::pair<NonLoc, nonloc::ConcreteInt>
getSimplifiedOffsets(NonLoc offset,nonloc::ConcreteInt extent,SValBuilder & svalBuilder)143 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
144                      SValBuilder &svalBuilder) {
145   std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
146   if (SymVal && SymVal->isExpression()) {
147     if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
148       llvm::APSInt constant =
149           APSIntType(extent.getValue()).convert(SIE->getRHS());
150       switch (SIE->getOpcode()) {
151       case BO_Mul:
152         // The constant should never be 0 here, becasue multiplication by zero
153         // is simplified by the engine.
154         if ((extent.getValue() % constant) != 0)
155           return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
156         else
157           return getSimplifiedOffsets(
158               nonloc::SymbolVal(SIE->getLHS()),
159               svalBuilder.makeIntVal(extent.getValue() / constant),
160               svalBuilder);
161       case BO_Add:
162         return getSimplifiedOffsets(
163             nonloc::SymbolVal(SIE->getLHS()),
164             svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
165       default:
166         break;
167       }
168     }
169   }
170 
171   return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
172 }
173 
174 // Evaluate the comparison Value < Threshold with the help of the custom
175 // simplification algorithm defined for this checker. Return a pair of states,
176 // where the first one corresponds to "value below threshold" and the second
177 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
178 // the case when the evaluation fails.
179 // If the optional argument CheckEquality is true, then use BO_EQ instead of
180 // the default BO_LT after consistently applying the same simplification steps.
181 static std::pair<ProgramStateRef, ProgramStateRef>
compareValueToThreshold(ProgramStateRef State,NonLoc Value,NonLoc Threshold,SValBuilder & SVB,bool CheckEquality=false)182 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
183                         SValBuilder &SVB, bool CheckEquality = false) {
184   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
185     std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
186   }
187   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
188     QualType T = Value.getType(SVB.getContext());
189     if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) {
190       // In this case we reduced the bound check to a comparison of the form
191       //   (symbol or value with unsigned type) < (negative number)
192       // which is always false. We are handling these cases separately because
193       // evalBinOpNN can perform a signed->unsigned conversion that turns the
194       // negative number into a huge positive value and leads to wildly
195       // inaccurate conclusions.
196       return {nullptr, State};
197     }
198   }
199   const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
200   auto BelowThreshold =
201       SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
202           .getAs<NonLoc>();
203 
204   if (BelowThreshold)
205     return State->assume(*BelowThreshold);
206 
207   return {nullptr, nullptr};
208 }
209 
getRegionName(const SubRegion * Region)210 static std::string getRegionName(const SubRegion *Region) {
211   if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
212     return RegName;
213 
214   // Field regions only have descriptive names when their parent has a
215   // descriptive name; so we provide a fallback representation for them:
216   if (const auto *FR = Region->getAs<FieldRegion>()) {
217     if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
218       return formatv("the field '{0}'", Name);
219     return "the unnamed field";
220   }
221 
222   if (isa<AllocaRegion>(Region))
223     return "the memory returned by 'alloca'";
224 
225   if (isa<SymbolicRegion>(Region) &&
226       isa<HeapSpaceRegion>(Region->getMemorySpace()))
227     return "the heap area";
228 
229   if (isa<StringRegion>(Region))
230     return "the string literal";
231 
232   return "the region";
233 }
234 
getConcreteValue(NonLoc SV)235 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
236   if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
237     return ConcreteVal->getValue().tryExtValue();
238   }
239   return std::nullopt;
240 }
241 
getShortMsg(OOB_Kind Kind,std::string RegName)242 static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
243   static const char *ShortMsgTemplates[] = {
244       "Out of bound access to memory preceding {0}",
245       "Out of bound access to memory after the end of {0}",
246       "Potential out of bound access to {0} with tainted offset"};
247 
248   return formatv(ShortMsgTemplates[Kind], RegName);
249 }
250 
getPrecedesMsgs(const SubRegion * Region,NonLoc Offset)251 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
252   std::string RegName = getRegionName(Region);
253   SmallString<128> Buf;
254   llvm::raw_svector_ostream Out(Buf);
255   Out << "Access of " << RegName << " at negative byte offset";
256   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
257     Out << ' ' << ConcreteIdx->getValue();
258   return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
259 }
260 
getExceedsMsgs(ASTContext & ACtx,const SubRegion * Region,NonLoc Offset,NonLoc Extent,SVal Location)261 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
262                                NonLoc Offset, NonLoc Extent, SVal Location) {
263   std::string RegName = getRegionName(Region);
264   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
265   assert(EReg && "this checker only handles element access");
266   QualType ElemType = EReg->getElementType();
267 
268   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
269   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
270 
271   bool UseByteOffsets = true;
272   if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
273     const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
274     const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
275     if (!OffsetHasRemainder && !ExtentHasRemainder) {
276       UseByteOffsets = false;
277       if (OffsetN)
278         *OffsetN /= ElemSize;
279       if (ExtentN)
280         *ExtentN /= ElemSize;
281     }
282   }
283 
284   SmallString<256> Buf;
285   llvm::raw_svector_ostream Out(Buf);
286   Out << "Access of ";
287   if (!ExtentN && !UseByteOffsets)
288     Out << "'" << ElemType.getAsString() << "' element in ";
289   Out << RegName << " at ";
290   if (OffsetN) {
291     Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
292   } else {
293     Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
294   }
295   if (ExtentN) {
296     Out << ", while it holds only ";
297     if (*ExtentN != 1)
298       Out << *ExtentN;
299     else
300       Out << "a single";
301     if (UseByteOffsets)
302       Out << " byte";
303     else
304       Out << " '" << ElemType.getAsString() << "' element";
305 
306     if (*ExtentN > 1)
307       Out << "s";
308   }
309 
310   return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
311 }
312 
getTaintMsgs(const SubRegion * Region,const char * OffsetName)313 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
314   std::string RegName = getRegionName(Region);
315   return {formatv("Potential out of bound access to {0} with tainted {1}",
316                   RegName, OffsetName),
317           formatv("Access of {0} with a tainted {1} that may be too large",
318                   RegName, OffsetName)};
319 }
320 
performCheck(const Expr * E,CheckerContext & C) const321 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
322   // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
323   // some new logic here that reasons directly about memory region extents.
324   // Once that logic is more mature, we can bring it back to assumeInBound()
325   // for all clients to use.
326   //
327   // The algorithm we are using here for bounds checking is to see if the
328   // memory access is within the extent of the base region.  Since we
329   // have some flexibility in defining the base region, we can achieve
330   // various levels of conservatism in our buffer overflow checking.
331 
332   const SVal Location = C.getSVal(E);
333 
334   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
335   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
336   // and incomplete analysis of these leads to false positives. As even
337   // accurate reports would be confusing for the users, just disable reports
338   // from these macros:
339   if (isFromCtypeMacro(E, C.getASTContext()))
340     return;
341 
342   ProgramStateRef State = C.getState();
343   SValBuilder &SVB = C.getSValBuilder();
344 
345   const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
346       computeOffset(State, SVB, Location);
347 
348   if (!RawOffset)
349     return;
350 
351   auto [Reg, ByteOffset] = *RawOffset;
352 
353   // CHECK LOWER BOUND
354   const MemSpaceRegion *Space = Reg->getMemorySpace();
355   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
356     // A symbolic region in unknown space represents an unknown pointer that
357     // may point into the middle of an array, so we don't look for underflows.
358     // Both conditions are significant because we want to check underflows in
359     // symbolic regions on the heap (which may be introduced by checkers like
360     // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
361     // non-symbolic regions (e.g. a field subregion of a symbolic region) in
362     // unknown space.
363     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
364         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
365 
366     if (PrecedesLowerBound && !WithinLowerBound) {
367       // We know that the index definitely precedes the lower bound.
368       Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
369       reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
370       return;
371     }
372 
373     if (WithinLowerBound)
374       State = WithinLowerBound;
375   }
376 
377   // CHECK UPPER BOUND
378   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
379   if (auto KnownSize = Size.getAs<NonLoc>()) {
380     auto [WithinUpperBound, ExceedsUpperBound] =
381         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
382 
383     if (ExceedsUpperBound) {
384       if (!WithinUpperBound) {
385         // We know that the index definitely exceeds the upper bound.
386         if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
387           // ...but this is within an addressof expression, so we need to check
388           // for the exceptional case that `&array[size]` is valid.
389           auto [EqualsToThreshold, NotEqualToThreshold] =
390               compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
391                                       SVB, /*CheckEquality=*/true);
392           if (EqualsToThreshold && !NotEqualToThreshold) {
393             // We are definitely in the exceptional case, so return early
394             // instead of reporting a bug.
395             C.addTransition(EqualsToThreshold);
396             return;
397           }
398         }
399         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
400                                        *KnownSize, Location);
401         reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
402         return;
403       }
404       if (isTainted(State, ByteOffset)) {
405         // Both cases are possible, but the offset is tainted, so report.
406         std::string RegName = getRegionName(Reg);
407 
408         // Diagnostic detail: "tainted offset" is always correct, but the
409         // common case is that 'idx' is tainted in 'arr[idx]' and then it's
410         // nicer to say "tainted index".
411         const char *OffsetName = "offset";
412         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
413           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
414             OffsetName = "index";
415 
416         Messages Msgs = getTaintMsgs(Reg, OffsetName);
417         reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
418         return;
419       }
420     }
421 
422     if (WithinUpperBound)
423       State = WithinUpperBound;
424   }
425 
426   C.addTransition(State);
427 }
428 
reportOOB(CheckerContext & C,ProgramStateRef ErrorState,OOB_Kind Kind,NonLoc Offset,Messages Msgs) const429 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
430                                     ProgramStateRef ErrorState, OOB_Kind Kind,
431                                     NonLoc Offset, Messages Msgs) const {
432 
433   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
434   if (!ErrorNode)
435     return;
436 
437   auto BR = std::make_unique<PathSensitiveBugReport>(
438       Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
439 
440   // Track back the propagation of taintedness.
441   if (Kind == OOB_Taint)
442     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
443       BR->markInteresting(Sym);
444 
445   C.emitReport(std::move(BR));
446 }
447 
isFromCtypeMacro(const Stmt * S,ASTContext & ACtx)448 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
449   SourceLocation Loc = S->getBeginLoc();
450   if (!Loc.isMacroID())
451     return false;
452 
453   StringRef MacroName = Lexer::getImmediateMacroName(
454       Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
455 
456   if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
457     return false;
458 
459   return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
460           (MacroName == "isblank") || (MacroName == "isdigit") ||
461           (MacroName == "isgraph") || (MacroName == "islower") ||
462           (MacroName == "isnctrl") || (MacroName == "isprint") ||
463           (MacroName == "ispunct") || (MacroName == "isspace") ||
464           (MacroName == "isupper") || (MacroName == "isxdigit"));
465 }
466 
isInAddressOf(const Stmt * S,ASTContext & ACtx)467 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
468   ParentMapContext &ParentCtx = ACtx.getParentMapContext();
469   do {
470     const DynTypedNodeList Parents = ParentCtx.getParents(*S);
471     if (Parents.empty())
472       return false;
473     S = Parents[0].get<Stmt>();
474   } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
475   const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
476   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
477 }
478 
registerArrayBoundCheckerV2(CheckerManager & mgr)479 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
480   mgr.registerChecker<ArrayBoundCheckerV2>();
481 }
482 
shouldRegisterArrayBoundCheckerV2(const CheckerManager & mgr)483 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
484   return true;
485 }
486