1/* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18package org.apache.spark.sql.catalyst.expressions 19 20import org.apache.spark.sql.catalyst.InternalRow 21import org.apache.spark.sql.catalyst.analysis.TypeCheckResult 22import org.apache.spark.sql.catalyst.expressions.codegen._ 23import org.apache.spark.sql.catalyst.trees.TreeNode 24import org.apache.spark.sql.types._ 25import org.apache.spark.util.Utils 26 27//////////////////////////////////////////////////////////////////////////////////////////////////// 28// This file defines the basic expression abstract classes in Catalyst. 29//////////////////////////////////////////////////////////////////////////////////////////////////// 30 31/** 32 * An expression in Catalyst. 33 * 34 * If an expression wants to be exposed in the function registry (so users can call it with 35 * "name(arguments...)", the concrete implementation must be a case class whose constructor 36 * arguments are all Expressions types. See [[Substring]] for an example. 37 * 38 * There are a few important traits: 39 * 40 * - [[Nondeterministic]]: an expression that is not deterministic. 41 * - [[Unevaluable]]: an expression that is not supposed to be evaluated. 42 * - [[CodegenFallback]]: an expression that does not have code gen implemented and falls back to 43 * interpreted mode. 44 * 45 * - [[LeafExpression]]: an expression that has no child. 46 * - [[UnaryExpression]]: an expression that has one child. 47 * - [[BinaryExpression]]: an expression that has two children. 48 * - [[TernaryExpression]]: an expression that has three children. 49 * - [[BinaryOperator]]: a special case of [[BinaryExpression]] that requires two children to have 50 * the same output data type. 51 * 52 */ 53abstract class Expression extends TreeNode[Expression] { 54 55 /** 56 * Returns true when an expression is a candidate for static evaluation before the query is 57 * executed. 58 * 59 * The following conditions are used to determine suitability for constant folding: 60 * - A [[Coalesce]] is foldable if all of its children are foldable 61 * - A [[BinaryExpression]] is foldable if its both left and right child are foldable 62 * - A [[Not]], [[IsNull]], or [[IsNotNull]] is foldable if its child is foldable 63 * - A [[Literal]] is foldable 64 * - A [[Cast]] or [[UnaryMinus]] is foldable if its child is foldable 65 */ 66 def foldable: Boolean = false 67 68 /** 69 * Returns true when the current expression always return the same result for fixed inputs from 70 * children. 71 * 72 * Note that this means that an expression should be considered as non-deterministic if: 73 * - it relies on some mutable internal state, or 74 * - it relies on some implicit input that is not part of the children expression list. 75 * - it has non-deterministic child or children. 76 * 77 * An example would be `SparkPartitionID` that relies on the partition id returned by TaskContext. 78 * By default leaf expressions are deterministic as Nil.forall(_.deterministic) returns true. 79 */ 80 def deterministic: Boolean = children.forall(_.deterministic) 81 82 def nullable: Boolean 83 84 def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator)) 85 86 /** Returns the result of evaluating this expression on a given input Row */ 87 def eval(input: InternalRow = null): Any 88 89 /** 90 * Returns an [[ExprCode]], that contains the Java source code to generate the result of 91 * evaluating the expression on an input row. 92 * 93 * @param ctx a [[CodegenContext]] 94 * @return [[ExprCode]] 95 */ 96 def genCode(ctx: CodegenContext): ExprCode = { 97 ctx.subExprEliminationExprs.get(this).map { subExprState => 98 // This expression is repeated which means that the code to evaluate it has already been added 99 // as a function before. In that case, we just re-use it. 100 ExprCode(ctx.registerComment(this.toString), subExprState.isNull, subExprState.value) 101 }.getOrElse { 102 val isNull = ctx.freshName("isNull") 103 val value = ctx.freshName("value") 104 val ve = doGenCode(ctx, ExprCode("", isNull, value)) 105 if (ve.code.nonEmpty) { 106 // Add `this` in the comment. 107 ve.copy(code = s"${ctx.registerComment(this.toString)}\n" + ve.code.trim) 108 } else { 109 ve 110 } 111 } 112 } 113 114 /** 115 * Returns Java source code that can be compiled to evaluate this expression. 116 * The default behavior is to call the eval method of the expression. Concrete expression 117 * implementations should override this to do actual code generation. 118 * 119 * @param ctx a [[CodegenContext]] 120 * @param ev an [[ExprCode]] with unique terms. 121 * @return an [[ExprCode]] containing the Java source code to generate the given expression 122 */ 123 protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode 124 125 /** 126 * Returns `true` if this expression and all its children have been resolved to a specific schema 127 * and input data types checking passed, and `false` if it still contains any unresolved 128 * placeholders or has data types mismatch. 129 * Implementations of expressions should override this if the resolution of this type of 130 * expression involves more than just the resolution of its children and type checking. 131 */ 132 lazy val resolved: Boolean = childrenResolved && checkInputDataTypes().isSuccess 133 134 /** 135 * Returns the [[DataType]] of the result of evaluating this expression. It is 136 * invalid to query the dataType of an unresolved expression (i.e., when `resolved` == false). 137 */ 138 def dataType: DataType 139 140 /** 141 * Returns true if all the children of this expression have been resolved to a specific schema 142 * and false if any still contains any unresolved placeholders. 143 */ 144 def childrenResolved: Boolean = children.forall(_.resolved) 145 146 /** 147 * Returns an expression where a best effort attempt has been made to transform `this` in a way 148 * that preserves the result but removes cosmetic variations (case sensitivity, ordering for 149 * commutative operations, etc.) See [[Canonicalize]] for more details. 150 * 151 * `deterministic` expressions where `this.canonicalized == other.canonicalized` will always 152 * evaluate to the same result. 153 */ 154 lazy val canonicalized: Expression = { 155 val canonicalizedChildren = children.map(_.canonicalized) 156 Canonicalize.execute(withNewChildren(canonicalizedChildren)) 157 } 158 159 /** 160 * Returns true when two expressions will always compute the same result, even if they differ 161 * cosmetically (i.e. capitalization of names in attributes may be different). 162 * 163 * See [[Canonicalize]] for more details. 164 */ 165 def semanticEquals(other: Expression): Boolean = 166 deterministic && other.deterministic && canonicalized == other.canonicalized 167 168 /** 169 * Returns a `hashCode` for the calculation performed by this expression. Unlike the standard 170 * `hashCode`, an attempt has been made to eliminate cosmetic differences. 171 * 172 * See [[Canonicalize]] for more details. 173 */ 174 def semanticHash(): Int = canonicalized.hashCode() 175 176 /** 177 * Checks the input data types, returns `TypeCheckResult.success` if it's valid, 178 * or returns a `TypeCheckResult` with an error message if invalid. 179 * Note: it's not valid to call this method until `childrenResolved == true`. 180 */ 181 def checkInputDataTypes(): TypeCheckResult = TypeCheckResult.TypeCheckSuccess 182 183 /** 184 * Returns a user-facing string representation of this expression's name. 185 * This should usually match the name of the function in SQL. 186 */ 187 def prettyName: String = nodeName.toLowerCase 188 189 protected def flatArguments: Iterator[Any] = productIterator.flatMap { 190 case t: Traversable[_] => t 191 case single => single :: Nil 192 } 193 194 // Marks this as final, Expression.verboseString should never be called, and thus shouldn't be 195 // overridden by concrete classes. 196 final override def verboseString: String = simpleString 197 198 override def simpleString: String = toString 199 200 override def toString: String = prettyName + Utils.truncatedString( 201 flatArguments.toSeq, "(", ", ", ")") 202 203 /** 204 * Returns SQL representation of this expression. For expressions extending [[NonSQLExpression]], 205 * this method may return an arbitrary user facing string. 206 */ 207 def sql: String = { 208 val childrenSQL = children.map(_.sql).mkString(", ") 209 s"$prettyName($childrenSQL)" 210 } 211} 212 213 214/** 215 * An expression that cannot be evaluated. Some expressions don't live past analysis or optimization 216 * time (e.g. Star). This trait is used by those expressions. 217 */ 218trait Unevaluable extends Expression { 219 220 final override def eval(input: InternalRow = null): Any = 221 throw new UnsupportedOperationException(s"Cannot evaluate expression: $this") 222 223 final override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = 224 throw new UnsupportedOperationException(s"Cannot evaluate expression: $this") 225} 226 227 228/** 229 * An expression that gets replaced at runtime (currently by the optimizer) into a different 230 * expression for evaluation. This is mainly used to provide compatibility with other databases. 231 * For example, we use this to support "nvl" by replacing it with "coalesce". 232 * 233 * A RuntimeReplaceable should have the original parameters along with a "child" expression in the 234 * case class constructor, and define a normal constructor that accepts only the original 235 * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL 236 * works correctly, the implementation should also override flatArguments method and sql method. 237 */ 238trait RuntimeReplaceable extends UnaryExpression with Unevaluable { 239 override def nullable: Boolean = child.nullable 240 override def foldable: Boolean = child.foldable 241 override def dataType: DataType = child.dataType 242} 243 244 245/** 246 * Expressions that don't have SQL representation should extend this trait. Examples are 247 * `ScalaUDF`, `ScalaUDAF`, and object expressions like `MapObjects` and `Invoke`. 248 */ 249trait NonSQLExpression extends Expression { 250 final override def sql: String = { 251 transform { 252 case a: Attribute => new PrettyAttribute(a) 253 }.toString 254 } 255} 256 257 258/** 259 * An expression that is nondeterministic. 260 */ 261trait Nondeterministic extends Expression { 262 final override def deterministic: Boolean = false 263 final override def foldable: Boolean = false 264 265 @transient 266 private[this] var initialized = false 267 268 /** 269 * Initializes internal states given the current partition index and mark this as initialized. 270 * Subclasses should override [[initializeInternal()]]. 271 */ 272 final def initialize(partitionIndex: Int): Unit = { 273 initializeInternal(partitionIndex) 274 initialized = true 275 } 276 277 protected def initializeInternal(partitionIndex: Int): Unit 278 279 /** 280 * @inheritdoc 281 * Throws an exception if [[initialize()]] is not called yet. 282 * Subclasses should override [[evalInternal()]]. 283 */ 284 final override def eval(input: InternalRow = null): Any = { 285 require(initialized, 286 s"Nondeterministic expression ${this.getClass.getName} should be initialized before eval.") 287 evalInternal(input) 288 } 289 290 protected def evalInternal(input: InternalRow): Any 291} 292 293 294/** 295 * A leaf expression, i.e. one without any child expressions. 296 */ 297abstract class LeafExpression extends Expression { 298 299 override final def children: Seq[Expression] = Nil 300} 301 302 303/** 304 * An expression with one input and one output. The output is by default evaluated to null 305 * if the input is evaluated to null. 306 */ 307abstract class UnaryExpression extends Expression { 308 309 def child: Expression 310 311 override final def children: Seq[Expression] = child :: Nil 312 313 override def foldable: Boolean = child.foldable 314 override def nullable: Boolean = child.nullable 315 316 /** 317 * Default behavior of evaluation according to the default nullability of UnaryExpression. 318 * If subclass of UnaryExpression override nullable, probably should also override this. 319 */ 320 override def eval(input: InternalRow): Any = { 321 val value = child.eval(input) 322 if (value == null) { 323 null 324 } else { 325 nullSafeEval(value) 326 } 327 } 328 329 /** 330 * Called by default [[eval]] implementation. If subclass of UnaryExpression keep the default 331 * nullability, they can override this method to save null-check code. If we need full control 332 * of evaluation process, we should override [[eval]]. 333 */ 334 protected def nullSafeEval(input: Any): Any = 335 sys.error(s"UnaryExpressions must override either eval or nullSafeEval") 336 337 /** 338 * Called by unary expressions to generate a code block that returns null if its parent returns 339 * null, and if not null, use `f` to generate the expression. 340 * 341 * As an example, the following does a boolean inversion (i.e. NOT). 342 * {{{ 343 * defineCodeGen(ctx, ev, c => s"!($c)") 344 * }}} 345 * 346 * @param f function that accepts a variable name and returns Java code to compute the output. 347 */ 348 protected def defineCodeGen( 349 ctx: CodegenContext, 350 ev: ExprCode, 351 f: String => String): ExprCode = { 352 nullSafeCodeGen(ctx, ev, eval => { 353 s"${ev.value} = ${f(eval)};" 354 }) 355 } 356 357 /** 358 * Called by unary expressions to generate a code block that returns null if its parent returns 359 * null, and if not null, use `f` to generate the expression. 360 * 361 * @param f function that accepts the non-null evaluation result name of child and returns Java 362 * code to compute the output. 363 */ 364 protected def nullSafeCodeGen( 365 ctx: CodegenContext, 366 ev: ExprCode, 367 f: String => String): ExprCode = { 368 val childGen = child.genCode(ctx) 369 val resultCode = f(childGen.value) 370 371 if (nullable) { 372 val nullSafeEval = ctx.nullSafeExec(child.nullable, childGen.isNull)(resultCode) 373 ev.copy(code = s""" 374 ${childGen.code} 375 boolean ${ev.isNull} = ${childGen.isNull}; 376 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 377 $nullSafeEval 378 """) 379 } else { 380 ev.copy(code = s""" 381 boolean ${ev.isNull} = false; 382 ${childGen.code} 383 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 384 $resultCode""", isNull = "false") 385 } 386 } 387} 388 389/** 390 * An expression with two inputs and one output. The output is by default evaluated to null 391 * if any input is evaluated to null. 392 */ 393abstract class BinaryExpression extends Expression { 394 395 def left: Expression 396 def right: Expression 397 398 override final def children: Seq[Expression] = Seq(left, right) 399 400 override def foldable: Boolean = left.foldable && right.foldable 401 402 override def nullable: Boolean = left.nullable || right.nullable 403 404 /** 405 * Default behavior of evaluation according to the default nullability of BinaryExpression. 406 * If subclass of BinaryExpression override nullable, probably should also override this. 407 */ 408 override def eval(input: InternalRow): Any = { 409 val value1 = left.eval(input) 410 if (value1 == null) { 411 null 412 } else { 413 val value2 = right.eval(input) 414 if (value2 == null) { 415 null 416 } else { 417 nullSafeEval(value1, value2) 418 } 419 } 420 } 421 422 /** 423 * Called by default [[eval]] implementation. If subclass of BinaryExpression keep the default 424 * nullability, they can override this method to save null-check code. If we need full control 425 * of evaluation process, we should override [[eval]]. 426 */ 427 protected def nullSafeEval(input1: Any, input2: Any): Any = 428 sys.error(s"BinaryExpressions must override either eval or nullSafeEval") 429 430 /** 431 * Short hand for generating binary evaluation code. 432 * If either of the sub-expressions is null, the result of this computation 433 * is assumed to be null. 434 * 435 * @param f accepts two variable names and returns Java code to compute the output. 436 */ 437 protected def defineCodeGen( 438 ctx: CodegenContext, 439 ev: ExprCode, 440 f: (String, String) => String): ExprCode = { 441 nullSafeCodeGen(ctx, ev, (eval1, eval2) => { 442 s"${ev.value} = ${f(eval1, eval2)};" 443 }) 444 } 445 446 /** 447 * Short hand for generating binary evaluation code. 448 * If either of the sub-expressions is null, the result of this computation 449 * is assumed to be null. 450 * 451 * @param f function that accepts the 2 non-null evaluation result names of children 452 * and returns Java code to compute the output. 453 */ 454 protected def nullSafeCodeGen( 455 ctx: CodegenContext, 456 ev: ExprCode, 457 f: (String, String) => String): ExprCode = { 458 val leftGen = left.genCode(ctx) 459 val rightGen = right.genCode(ctx) 460 val resultCode = f(leftGen.value, rightGen.value) 461 462 if (nullable) { 463 val nullSafeEval = 464 leftGen.code + ctx.nullSafeExec(left.nullable, leftGen.isNull) { 465 rightGen.code + ctx.nullSafeExec(right.nullable, rightGen.isNull) { 466 s""" 467 ${ev.isNull} = false; // resultCode could change nullability. 468 $resultCode 469 """ 470 } 471 } 472 473 ev.copy(code = s""" 474 boolean ${ev.isNull} = true; 475 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 476 $nullSafeEval 477 """) 478 } else { 479 ev.copy(code = s""" 480 boolean ${ev.isNull} = false; 481 ${leftGen.code} 482 ${rightGen.code} 483 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 484 $resultCode""", isNull = "false") 485 } 486 } 487} 488 489 490/** 491 * A [[BinaryExpression]] that is an operator, with two properties: 492 * 493 * 1. The string representation is "x symbol y", rather than "funcName(x, y)". 494 * 2. Two inputs are expected to the be same type. If the two inputs have different types, 495 * the analyzer will find the tightest common type and do the proper type casting. 496 */ 497abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes { 498 499 /** 500 * Expected input type from both left/right child expressions, similar to the 501 * [[ImplicitCastInputTypes]] trait. 502 */ 503 def inputType: AbstractDataType 504 505 def symbol: String 506 507 def sqlOperator: String = symbol 508 509 override def toString: String = s"($left $symbol $right)" 510 511 override def inputTypes: Seq[AbstractDataType] = Seq(inputType, inputType) 512 513 override def checkInputDataTypes(): TypeCheckResult = { 514 // First check whether left and right have the same type, then check if the type is acceptable. 515 if (!left.dataType.sameType(right.dataType)) { 516 TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " + 517 s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).") 518 } else if (!inputType.acceptsType(left.dataType)) { 519 TypeCheckResult.TypeCheckFailure(s"'$sql' requires ${inputType.simpleString} type," + 520 s" not ${left.dataType.simpleString}") 521 } else { 522 TypeCheckResult.TypeCheckSuccess 523 } 524 } 525 526 override def sql: String = s"(${left.sql} $sqlOperator ${right.sql})" 527} 528 529 530object BinaryOperator { 531 def unapply(e: BinaryOperator): Option[(Expression, Expression)] = Some((e.left, e.right)) 532} 533 534/** 535 * An expression with three inputs and one output. The output is by default evaluated to null 536 * if any input is evaluated to null. 537 */ 538abstract class TernaryExpression extends Expression { 539 540 override def foldable: Boolean = children.forall(_.foldable) 541 542 override def nullable: Boolean = children.exists(_.nullable) 543 544 /** 545 * Default behavior of evaluation according to the default nullability of TernaryExpression. 546 * If subclass of TernaryExpression override nullable, probably should also override this. 547 */ 548 override def eval(input: InternalRow): Any = { 549 val exprs = children 550 val value1 = exprs(0).eval(input) 551 if (value1 != null) { 552 val value2 = exprs(1).eval(input) 553 if (value2 != null) { 554 val value3 = exprs(2).eval(input) 555 if (value3 != null) { 556 return nullSafeEval(value1, value2, value3) 557 } 558 } 559 } 560 null 561 } 562 563 /** 564 * Called by default [[eval]] implementation. If subclass of TernaryExpression keep the default 565 * nullability, they can override this method to save null-check code. If we need full control 566 * of evaluation process, we should override [[eval]]. 567 */ 568 protected def nullSafeEval(input1: Any, input2: Any, input3: Any): Any = 569 sys.error(s"TernaryExpressions must override either eval or nullSafeEval") 570 571 /** 572 * Short hand for generating ternary evaluation code. 573 * If either of the sub-expressions is null, the result of this computation 574 * is assumed to be null. 575 * 576 * @param f accepts three variable names and returns Java code to compute the output. 577 */ 578 protected def defineCodeGen( 579 ctx: CodegenContext, 580 ev: ExprCode, 581 f: (String, String, String) => String): ExprCode = { 582 nullSafeCodeGen(ctx, ev, (eval1, eval2, eval3) => { 583 s"${ev.value} = ${f(eval1, eval2, eval3)};" 584 }) 585 } 586 587 /** 588 * Short hand for generating ternary evaluation code. 589 * If either of the sub-expressions is null, the result of this computation 590 * is assumed to be null. 591 * 592 * @param f function that accepts the 3 non-null evaluation result names of children 593 * and returns Java code to compute the output. 594 */ 595 protected def nullSafeCodeGen( 596 ctx: CodegenContext, 597 ev: ExprCode, 598 f: (String, String, String) => String): ExprCode = { 599 val leftGen = children(0).genCode(ctx) 600 val midGen = children(1).genCode(ctx) 601 val rightGen = children(2).genCode(ctx) 602 val resultCode = f(leftGen.value, midGen.value, rightGen.value) 603 604 if (nullable) { 605 val nullSafeEval = 606 leftGen.code + ctx.nullSafeExec(children(0).nullable, leftGen.isNull) { 607 midGen.code + ctx.nullSafeExec(children(1).nullable, midGen.isNull) { 608 rightGen.code + ctx.nullSafeExec(children(2).nullable, rightGen.isNull) { 609 s""" 610 ${ev.isNull} = false; // resultCode could change nullability. 611 $resultCode 612 """ 613 } 614 } 615 } 616 617 ev.copy(code = s""" 618 boolean ${ev.isNull} = true; 619 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 620 $nullSafeEval""") 621 } else { 622 ev.copy(code = s""" 623 boolean ${ev.isNull} = false; 624 ${leftGen.code} 625 ${midGen.code} 626 ${rightGen.code} 627 ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; 628 $resultCode""", isNull = "false") 629 } 630 } 631} 632