1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "clang/AST/CharUnits.h"
15 #include "clang/AST/ParentMapContext.h"
16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17 #include "clang/StaticAnalyzer/Checkers/Taint.h"
18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19 #include "clang/StaticAnalyzer/Core/Checker.h"
20 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34
35 namespace {
36 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
37
38 struct Messages {
39 std::string Short, Full;
40 };
41
42 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
43 // instead of `PreStmt` because the current implementation passes the whole
44 // expression to `CheckerContext::getSVal()` which only works after the
45 // symbolic evaluation of the expression. (To turn them into `PreStmt`
46 // callbacks, we'd need to duplicate the logic that evaluates these
47 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
48 // defined as `PostStmt` for the sake of consistency with the other callbacks.
49 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
50 check::PostStmt<UnaryOperator>,
51 check::PostStmt<MemberExpr>> {
52 BugType BT{this, "Out-of-bound access"};
53 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
54
55 void performCheck(const Expr *E, CheckerContext &C) const;
56
57 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
58 NonLoc Offset, Messages Msgs) const;
59
60 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
61
62 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
63
64 public:
checkPostStmt(const ArraySubscriptExpr * E,CheckerContext & C) const65 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
66 performCheck(E, C);
67 }
checkPostStmt(const UnaryOperator * E,CheckerContext & C) const68 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
69 if (E->getOpcode() == UO_Deref)
70 performCheck(E, C);
71 }
checkPostStmt(const MemberExpr * E,CheckerContext & C) const72 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
73 if (E->isArrow())
74 performCheck(E->getBase(), C);
75 }
76 };
77
78 } // anonymous namespace
79
80 /// For a given Location that can be represented as a symbolic expression
81 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
82 /// Arr and the distance of Location from the beginning of Arr (expressed in a
83 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
84 /// cannot be determined.
85 static std::optional<std::pair<const SubRegion *, NonLoc>>
computeOffset(ProgramStateRef State,SValBuilder & SVB,SVal Location)86 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
87 QualType T = SVB.getArrayIndexType();
88 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
89 // We will use this utility to add and multiply values.
90 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
91 };
92
93 const SubRegion *OwnerRegion = nullptr;
94 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
95
96 const ElementRegion *CurRegion =
97 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
98
99 while (CurRegion) {
100 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
101 if (!Index)
102 return std::nullopt;
103
104 QualType ElemType = CurRegion->getElementType();
105
106 // FIXME: The following early return was presumably added to safeguard the
107 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
108 // it seems that `ElemType` cannot be incomplete at this point.
109 if (ElemType->isIncompleteType())
110 return std::nullopt;
111
112 // Calculate Delta = Index * sizeof(ElemType).
113 NonLoc Size = SVB.makeArrayIndex(
114 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
115 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
116 if (!Delta)
117 return std::nullopt;
118
119 // Perform Offset += Delta.
120 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
121 if (!Offset)
122 return std::nullopt;
123
124 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
125 // When this is just another ElementRegion layer, we need to continue the
126 // offset calculations:
127 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
128 }
129
130 if (OwnerRegion)
131 return std::make_pair(OwnerRegion, *Offset);
132
133 return std::nullopt;
134 }
135
136 // TODO: once the constraint manager is smart enough to handle non simplified
137 // symbolic expressions remove this function. Note that this can not be used in
138 // the constraint manager as is, since this does not handle overflows. It is
139 // safe to assume, however, that memory offsets will not overflow.
140 // NOTE: callers of this function need to be aware of the effects of overflows
141 // and signed<->unsigned conversions!
142 static std::pair<NonLoc, nonloc::ConcreteInt>
getSimplifiedOffsets(NonLoc offset,nonloc::ConcreteInt extent,SValBuilder & svalBuilder)143 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
144 SValBuilder &svalBuilder) {
145 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
146 if (SymVal && SymVal->isExpression()) {
147 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
148 llvm::APSInt constant =
149 APSIntType(extent.getValue()).convert(SIE->getRHS());
150 switch (SIE->getOpcode()) {
151 case BO_Mul:
152 // The constant should never be 0 here, becasue multiplication by zero
153 // is simplified by the engine.
154 if ((extent.getValue() % constant) != 0)
155 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
156 else
157 return getSimplifiedOffsets(
158 nonloc::SymbolVal(SIE->getLHS()),
159 svalBuilder.makeIntVal(extent.getValue() / constant),
160 svalBuilder);
161 case BO_Add:
162 return getSimplifiedOffsets(
163 nonloc::SymbolVal(SIE->getLHS()),
164 svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
165 default:
166 break;
167 }
168 }
169 }
170
171 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
172 }
173
174 // Evaluate the comparison Value < Threshold with the help of the custom
175 // simplification algorithm defined for this checker. Return a pair of states,
176 // where the first one corresponds to "value below threshold" and the second
177 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
178 // the case when the evaluation fails.
179 // If the optional argument CheckEquality is true, then use BO_EQ instead of
180 // the default BO_LT after consistently applying the same simplification steps.
181 static std::pair<ProgramStateRef, ProgramStateRef>
compareValueToThreshold(ProgramStateRef State,NonLoc Value,NonLoc Threshold,SValBuilder & SVB,bool CheckEquality=false)182 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
183 SValBuilder &SVB, bool CheckEquality = false) {
184 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
185 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
186 }
187 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
188 QualType T = Value.getType(SVB.getContext());
189 if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) {
190 // In this case we reduced the bound check to a comparison of the form
191 // (symbol or value with unsigned type) < (negative number)
192 // which is always false. We are handling these cases separately because
193 // evalBinOpNN can perform a signed->unsigned conversion that turns the
194 // negative number into a huge positive value and leads to wildly
195 // inaccurate conclusions.
196 return {nullptr, State};
197 }
198 }
199 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
200 auto BelowThreshold =
201 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
202 .getAs<NonLoc>();
203
204 if (BelowThreshold)
205 return State->assume(*BelowThreshold);
206
207 return {nullptr, nullptr};
208 }
209
getRegionName(const SubRegion * Region)210 static std::string getRegionName(const SubRegion *Region) {
211 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
212 return RegName;
213
214 // Field regions only have descriptive names when their parent has a
215 // descriptive name; so we provide a fallback representation for them:
216 if (const auto *FR = Region->getAs<FieldRegion>()) {
217 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
218 return formatv("the field '{0}'", Name);
219 return "the unnamed field";
220 }
221
222 if (isa<AllocaRegion>(Region))
223 return "the memory returned by 'alloca'";
224
225 if (isa<SymbolicRegion>(Region) &&
226 isa<HeapSpaceRegion>(Region->getMemorySpace()))
227 return "the heap area";
228
229 if (isa<StringRegion>(Region))
230 return "the string literal";
231
232 return "the region";
233 }
234
getConcreteValue(NonLoc SV)235 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
236 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
237 return ConcreteVal->getValue().tryExtValue();
238 }
239 return std::nullopt;
240 }
241
getShortMsg(OOB_Kind Kind,std::string RegName)242 static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
243 static const char *ShortMsgTemplates[] = {
244 "Out of bound access to memory preceding {0}",
245 "Out of bound access to memory after the end of {0}",
246 "Potential out of bound access to {0} with tainted offset"};
247
248 return formatv(ShortMsgTemplates[Kind], RegName);
249 }
250
getPrecedesMsgs(const SubRegion * Region,NonLoc Offset)251 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
252 std::string RegName = getRegionName(Region);
253 SmallString<128> Buf;
254 llvm::raw_svector_ostream Out(Buf);
255 Out << "Access of " << RegName << " at negative byte offset";
256 if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
257 Out << ' ' << ConcreteIdx->getValue();
258 return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
259 }
260
getExceedsMsgs(ASTContext & ACtx,const SubRegion * Region,NonLoc Offset,NonLoc Extent,SVal Location)261 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
262 NonLoc Offset, NonLoc Extent, SVal Location) {
263 std::string RegName = getRegionName(Region);
264 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
265 assert(EReg && "this checker only handles element access");
266 QualType ElemType = EReg->getElementType();
267
268 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
269 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
270
271 bool UseByteOffsets = true;
272 if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
273 const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
274 const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
275 if (!OffsetHasRemainder && !ExtentHasRemainder) {
276 UseByteOffsets = false;
277 if (OffsetN)
278 *OffsetN /= ElemSize;
279 if (ExtentN)
280 *ExtentN /= ElemSize;
281 }
282 }
283
284 SmallString<256> Buf;
285 llvm::raw_svector_ostream Out(Buf);
286 Out << "Access of ";
287 if (!ExtentN && !UseByteOffsets)
288 Out << "'" << ElemType.getAsString() << "' element in ";
289 Out << RegName << " at ";
290 if (OffsetN) {
291 Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
292 } else {
293 Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
294 }
295 if (ExtentN) {
296 Out << ", while it holds only ";
297 if (*ExtentN != 1)
298 Out << *ExtentN;
299 else
300 Out << "a single";
301 if (UseByteOffsets)
302 Out << " byte";
303 else
304 Out << " '" << ElemType.getAsString() << "' element";
305
306 if (*ExtentN > 1)
307 Out << "s";
308 }
309
310 return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
311 }
312
getTaintMsgs(const SubRegion * Region,const char * OffsetName)313 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
314 std::string RegName = getRegionName(Region);
315 return {formatv("Potential out of bound access to {0} with tainted {1}",
316 RegName, OffsetName),
317 formatv("Access of {0} with a tainted {1} that may be too large",
318 RegName, OffsetName)};
319 }
320
performCheck(const Expr * E,CheckerContext & C) const321 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
322 // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
323 // some new logic here that reasons directly about memory region extents.
324 // Once that logic is more mature, we can bring it back to assumeInBound()
325 // for all clients to use.
326 //
327 // The algorithm we are using here for bounds checking is to see if the
328 // memory access is within the extent of the base region. Since we
329 // have some flexibility in defining the base region, we can achieve
330 // various levels of conservatism in our buffer overflow checking.
331
332 const SVal Location = C.getSVal(E);
333
334 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
335 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
336 // and incomplete analysis of these leads to false positives. As even
337 // accurate reports would be confusing for the users, just disable reports
338 // from these macros:
339 if (isFromCtypeMacro(E, C.getASTContext()))
340 return;
341
342 ProgramStateRef State = C.getState();
343 SValBuilder &SVB = C.getSValBuilder();
344
345 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
346 computeOffset(State, SVB, Location);
347
348 if (!RawOffset)
349 return;
350
351 auto [Reg, ByteOffset] = *RawOffset;
352
353 // CHECK LOWER BOUND
354 const MemSpaceRegion *Space = Reg->getMemorySpace();
355 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
356 // A symbolic region in unknown space represents an unknown pointer that
357 // may point into the middle of an array, so we don't look for underflows.
358 // Both conditions are significant because we want to check underflows in
359 // symbolic regions on the heap (which may be introduced by checkers like
360 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
361 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
362 // unknown space.
363 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
364 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
365
366 if (PrecedesLowerBound && !WithinLowerBound) {
367 // We know that the index definitely precedes the lower bound.
368 Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
369 reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
370 return;
371 }
372
373 if (WithinLowerBound)
374 State = WithinLowerBound;
375 }
376
377 // CHECK UPPER BOUND
378 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
379 if (auto KnownSize = Size.getAs<NonLoc>()) {
380 auto [WithinUpperBound, ExceedsUpperBound] =
381 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
382
383 if (ExceedsUpperBound) {
384 if (!WithinUpperBound) {
385 // We know that the index definitely exceeds the upper bound.
386 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
387 // ...but this is within an addressof expression, so we need to check
388 // for the exceptional case that `&array[size]` is valid.
389 auto [EqualsToThreshold, NotEqualToThreshold] =
390 compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
391 SVB, /*CheckEquality=*/true);
392 if (EqualsToThreshold && !NotEqualToThreshold) {
393 // We are definitely in the exceptional case, so return early
394 // instead of reporting a bug.
395 C.addTransition(EqualsToThreshold);
396 return;
397 }
398 }
399 Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
400 *KnownSize, Location);
401 reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
402 return;
403 }
404 if (isTainted(State, ByteOffset)) {
405 // Both cases are possible, but the offset is tainted, so report.
406 std::string RegName = getRegionName(Reg);
407
408 // Diagnostic detail: "tainted offset" is always correct, but the
409 // common case is that 'idx' is tainted in 'arr[idx]' and then it's
410 // nicer to say "tainted index".
411 const char *OffsetName = "offset";
412 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
413 if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
414 OffsetName = "index";
415
416 Messages Msgs = getTaintMsgs(Reg, OffsetName);
417 reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
418 return;
419 }
420 }
421
422 if (WithinUpperBound)
423 State = WithinUpperBound;
424 }
425
426 C.addTransition(State);
427 }
428
reportOOB(CheckerContext & C,ProgramStateRef ErrorState,OOB_Kind Kind,NonLoc Offset,Messages Msgs) const429 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
430 ProgramStateRef ErrorState, OOB_Kind Kind,
431 NonLoc Offset, Messages Msgs) const {
432
433 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
434 if (!ErrorNode)
435 return;
436
437 auto BR = std::make_unique<PathSensitiveBugReport>(
438 Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
439
440 // Track back the propagation of taintedness.
441 if (Kind == OOB_Taint)
442 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
443 BR->markInteresting(Sym);
444
445 C.emitReport(std::move(BR));
446 }
447
isFromCtypeMacro(const Stmt * S,ASTContext & ACtx)448 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
449 SourceLocation Loc = S->getBeginLoc();
450 if (!Loc.isMacroID())
451 return false;
452
453 StringRef MacroName = Lexer::getImmediateMacroName(
454 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
455
456 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
457 return false;
458
459 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
460 (MacroName == "isblank") || (MacroName == "isdigit") ||
461 (MacroName == "isgraph") || (MacroName == "islower") ||
462 (MacroName == "isnctrl") || (MacroName == "isprint") ||
463 (MacroName == "ispunct") || (MacroName == "isspace") ||
464 (MacroName == "isupper") || (MacroName == "isxdigit"));
465 }
466
isInAddressOf(const Stmt * S,ASTContext & ACtx)467 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
468 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
469 do {
470 const DynTypedNodeList Parents = ParentCtx.getParents(*S);
471 if (Parents.empty())
472 return false;
473 S = Parents[0].get<Stmt>();
474 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
475 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
476 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
477 }
478
registerArrayBoundCheckerV2(CheckerManager & mgr)479 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
480 mgr.registerChecker<ArrayBoundCheckerV2>();
481 }
482
shouldRegisterArrayBoundCheckerV2(const CheckerManager & mgr)483 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
484 return true;
485 }
486