1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *    http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18package org.apache.spark.sql.catalyst.expressions
19
20import org.apache.spark.sql.catalyst.InternalRow
21import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
22import org.apache.spark.sql.catalyst.expressions.codegen._
23import org.apache.spark.sql.catalyst.trees.TreeNode
24import org.apache.spark.sql.types._
25import org.apache.spark.util.Utils
26
27////////////////////////////////////////////////////////////////////////////////////////////////////
28// This file defines the basic expression abstract classes in Catalyst.
29////////////////////////////////////////////////////////////////////////////////////////////////////
30
31/**
32 * An expression in Catalyst.
33 *
34 * If an expression wants to be exposed in the function registry (so users can call it with
35 * "name(arguments...)", the concrete implementation must be a case class whose constructor
36 * arguments are all Expressions types. See [[Substring]] for an example.
37 *
38 * There are a few important traits:
39 *
40 * - [[Nondeterministic]]: an expression that is not deterministic.
41 * - [[Unevaluable]]: an expression that is not supposed to be evaluated.
42 * - [[CodegenFallback]]: an expression that does not have code gen implemented and falls back to
43 *                        interpreted mode.
44 *
45 * - [[LeafExpression]]: an expression that has no child.
46 * - [[UnaryExpression]]: an expression that has one child.
47 * - [[BinaryExpression]]: an expression that has two children.
48 * - [[TernaryExpression]]: an expression that has three children.
49 * - [[BinaryOperator]]: a special case of [[BinaryExpression]] that requires two children to have
50 *                       the same output data type.
51 *
52 */
53abstract class Expression extends TreeNode[Expression] {
54
55  /**
56   * Returns true when an expression is a candidate for static evaluation before the query is
57   * executed.
58   *
59   * The following conditions are used to determine suitability for constant folding:
60   *  - A [[Coalesce]] is foldable if all of its children are foldable
61   *  - A [[BinaryExpression]] is foldable if its both left and right child are foldable
62   *  - A [[Not]], [[IsNull]], or [[IsNotNull]] is foldable if its child is foldable
63   *  - A [[Literal]] is foldable
64   *  - A [[Cast]] or [[UnaryMinus]] is foldable if its child is foldable
65   */
66  def foldable: Boolean = false
67
68  /**
69   * Returns true when the current expression always return the same result for fixed inputs from
70   * children.
71   *
72   * Note that this means that an expression should be considered as non-deterministic if:
73   * - it relies on some mutable internal state, or
74   * - it relies on some implicit input that is not part of the children expression list.
75   * - it has non-deterministic child or children.
76   *
77   * An example would be `SparkPartitionID` that relies on the partition id returned by TaskContext.
78   * By default leaf expressions are deterministic as Nil.forall(_.deterministic) returns true.
79   */
80  def deterministic: Boolean = children.forall(_.deterministic)
81
82  def nullable: Boolean
83
84  def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator))
85
86  /** Returns the result of evaluating this expression on a given input Row */
87  def eval(input: InternalRow = null): Any
88
89  /**
90   * Returns an [[ExprCode]], that contains the Java source code to generate the result of
91   * evaluating the expression on an input row.
92   *
93   * @param ctx a [[CodegenContext]]
94   * @return [[ExprCode]]
95   */
96  def genCode(ctx: CodegenContext): ExprCode = {
97    ctx.subExprEliminationExprs.get(this).map { subExprState =>
98      // This expression is repeated which means that the code to evaluate it has already been added
99      // as a function before. In that case, we just re-use it.
100      ExprCode(ctx.registerComment(this.toString), subExprState.isNull, subExprState.value)
101    }.getOrElse {
102      val isNull = ctx.freshName("isNull")
103      val value = ctx.freshName("value")
104      val ve = doGenCode(ctx, ExprCode("", isNull, value))
105      if (ve.code.nonEmpty) {
106        // Add `this` in the comment.
107        ve.copy(code = s"${ctx.registerComment(this.toString)}\n" + ve.code.trim)
108      } else {
109        ve
110      }
111    }
112  }
113
114  /**
115   * Returns Java source code that can be compiled to evaluate this expression.
116   * The default behavior is to call the eval method of the expression. Concrete expression
117   * implementations should override this to do actual code generation.
118   *
119   * @param ctx a [[CodegenContext]]
120   * @param ev an [[ExprCode]] with unique terms.
121   * @return an [[ExprCode]] containing the Java source code to generate the given expression
122   */
123  protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode
124
125  /**
126   * Returns `true` if this expression and all its children have been resolved to a specific schema
127   * and input data types checking passed, and `false` if it still contains any unresolved
128   * placeholders or has data types mismatch.
129   * Implementations of expressions should override this if the resolution of this type of
130   * expression involves more than just the resolution of its children and type checking.
131   */
132  lazy val resolved: Boolean = childrenResolved && checkInputDataTypes().isSuccess
133
134  /**
135   * Returns the [[DataType]] of the result of evaluating this expression.  It is
136   * invalid to query the dataType of an unresolved expression (i.e., when `resolved` == false).
137   */
138  def dataType: DataType
139
140  /**
141   * Returns true if  all the children of this expression have been resolved to a specific schema
142   * and false if any still contains any unresolved placeholders.
143   */
144  def childrenResolved: Boolean = children.forall(_.resolved)
145
146  /**
147   * Returns an expression where a best effort attempt has been made to transform `this` in a way
148   * that preserves the result but removes cosmetic variations (case sensitivity, ordering for
149   * commutative operations, etc.)  See [[Canonicalize]] for more details.
150   *
151   * `deterministic` expressions where `this.canonicalized == other.canonicalized` will always
152   * evaluate to the same result.
153   */
154  lazy val canonicalized: Expression = {
155    val canonicalizedChildren = children.map(_.canonicalized)
156    Canonicalize.execute(withNewChildren(canonicalizedChildren))
157  }
158
159  /**
160   * Returns true when two expressions will always compute the same result, even if they differ
161   * cosmetically (i.e. capitalization of names in attributes may be different).
162   *
163   * See [[Canonicalize]] for more details.
164   */
165  def semanticEquals(other: Expression): Boolean =
166    deterministic && other.deterministic && canonicalized == other.canonicalized
167
168  /**
169   * Returns a `hashCode` for the calculation performed by this expression. Unlike the standard
170   * `hashCode`, an attempt has been made to eliminate cosmetic differences.
171   *
172   * See [[Canonicalize]] for more details.
173   */
174  def semanticHash(): Int = canonicalized.hashCode()
175
176  /**
177   * Checks the input data types, returns `TypeCheckResult.success` if it's valid,
178   * or returns a `TypeCheckResult` with an error message if invalid.
179   * Note: it's not valid to call this method until `childrenResolved == true`.
180   */
181  def checkInputDataTypes(): TypeCheckResult = TypeCheckResult.TypeCheckSuccess
182
183  /**
184   * Returns a user-facing string representation of this expression's name.
185   * This should usually match the name of the function in SQL.
186   */
187  def prettyName: String = nodeName.toLowerCase
188
189  protected def flatArguments: Iterator[Any] = productIterator.flatMap {
190    case t: Traversable[_] => t
191    case single => single :: Nil
192  }
193
194  // Marks this as final, Expression.verboseString should never be called, and thus shouldn't be
195  // overridden by concrete classes.
196  final override def verboseString: String = simpleString
197
198  override def simpleString: String = toString
199
200  override def toString: String = prettyName + Utils.truncatedString(
201    flatArguments.toSeq, "(", ", ", ")")
202
203  /**
204   * Returns SQL representation of this expression.  For expressions extending [[NonSQLExpression]],
205   * this method may return an arbitrary user facing string.
206   */
207  def sql: String = {
208    val childrenSQL = children.map(_.sql).mkString(", ")
209    s"$prettyName($childrenSQL)"
210  }
211}
212
213
214/**
215 * An expression that cannot be evaluated. Some expressions don't live past analysis or optimization
216 * time (e.g. Star). This trait is used by those expressions.
217 */
218trait Unevaluable extends Expression {
219
220  final override def eval(input: InternalRow = null): Any =
221    throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
222
223  final override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
224    throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
225}
226
227
228/**
229 * An expression that gets replaced at runtime (currently by the optimizer) into a different
230 * expression for evaluation. This is mainly used to provide compatibility with other databases.
231 * For example, we use this to support "nvl" by replacing it with "coalesce".
232 *
233 * A RuntimeReplaceable should have the original parameters along with a "child" expression in the
234 * case class constructor, and define a normal constructor that accepts only the original
235 * parameters. For an example, see [[Nvl]]. To make sure the explain plan and expression SQL
236 * works correctly, the implementation should also override flatArguments method and sql method.
237 */
238trait RuntimeReplaceable extends UnaryExpression with Unevaluable {
239  override def nullable: Boolean = child.nullable
240  override def foldable: Boolean = child.foldable
241  override def dataType: DataType = child.dataType
242}
243
244
245/**
246 * Expressions that don't have SQL representation should extend this trait.  Examples are
247 * `ScalaUDF`, `ScalaUDAF`, and object expressions like `MapObjects` and `Invoke`.
248 */
249trait NonSQLExpression extends Expression {
250  final override def sql: String = {
251    transform {
252      case a: Attribute => new PrettyAttribute(a)
253    }.toString
254  }
255}
256
257
258/**
259 * An expression that is nondeterministic.
260 */
261trait Nondeterministic extends Expression {
262  final override def deterministic: Boolean = false
263  final override def foldable: Boolean = false
264
265  @transient
266  private[this] var initialized = false
267
268  /**
269   * Initializes internal states given the current partition index and mark this as initialized.
270   * Subclasses should override [[initializeInternal()]].
271   */
272  final def initialize(partitionIndex: Int): Unit = {
273    initializeInternal(partitionIndex)
274    initialized = true
275  }
276
277  protected def initializeInternal(partitionIndex: Int): Unit
278
279  /**
280   * @inheritdoc
281   * Throws an exception if [[initialize()]] is not called yet.
282   * Subclasses should override [[evalInternal()]].
283   */
284  final override def eval(input: InternalRow = null): Any = {
285    require(initialized,
286      s"Nondeterministic expression ${this.getClass.getName} should be initialized before eval.")
287    evalInternal(input)
288  }
289
290  protected def evalInternal(input: InternalRow): Any
291}
292
293
294/**
295 * A leaf expression, i.e. one without any child expressions.
296 */
297abstract class LeafExpression extends Expression {
298
299  override final def children: Seq[Expression] = Nil
300}
301
302
303/**
304 * An expression with one input and one output. The output is by default evaluated to null
305 * if the input is evaluated to null.
306 */
307abstract class UnaryExpression extends Expression {
308
309  def child: Expression
310
311  override final def children: Seq[Expression] = child :: Nil
312
313  override def foldable: Boolean = child.foldable
314  override def nullable: Boolean = child.nullable
315
316  /**
317   * Default behavior of evaluation according to the default nullability of UnaryExpression.
318   * If subclass of UnaryExpression override nullable, probably should also override this.
319   */
320  override def eval(input: InternalRow): Any = {
321    val value = child.eval(input)
322    if (value == null) {
323      null
324    } else {
325      nullSafeEval(value)
326    }
327  }
328
329  /**
330   * Called by default [[eval]] implementation.  If subclass of UnaryExpression keep the default
331   * nullability, they can override this method to save null-check code.  If we need full control
332   * of evaluation process, we should override [[eval]].
333   */
334  protected def nullSafeEval(input: Any): Any =
335    sys.error(s"UnaryExpressions must override either eval or nullSafeEval")
336
337  /**
338   * Called by unary expressions to generate a code block that returns null if its parent returns
339   * null, and if not null, use `f` to generate the expression.
340   *
341   * As an example, the following does a boolean inversion (i.e. NOT).
342   * {{{
343   *   defineCodeGen(ctx, ev, c => s"!($c)")
344   * }}}
345   *
346   * @param f function that accepts a variable name and returns Java code to compute the output.
347   */
348  protected def defineCodeGen(
349      ctx: CodegenContext,
350      ev: ExprCode,
351      f: String => String): ExprCode = {
352    nullSafeCodeGen(ctx, ev, eval => {
353      s"${ev.value} = ${f(eval)};"
354    })
355  }
356
357  /**
358   * Called by unary expressions to generate a code block that returns null if its parent returns
359   * null, and if not null, use `f` to generate the expression.
360   *
361   * @param f function that accepts the non-null evaluation result name of child and returns Java
362   *          code to compute the output.
363   */
364  protected def nullSafeCodeGen(
365      ctx: CodegenContext,
366      ev: ExprCode,
367      f: String => String): ExprCode = {
368    val childGen = child.genCode(ctx)
369    val resultCode = f(childGen.value)
370
371    if (nullable) {
372      val nullSafeEval = ctx.nullSafeExec(child.nullable, childGen.isNull)(resultCode)
373      ev.copy(code = s"""
374        ${childGen.code}
375        boolean ${ev.isNull} = ${childGen.isNull};
376        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
377        $nullSafeEval
378      """)
379    } else {
380      ev.copy(code = s"""
381        boolean ${ev.isNull} = false;
382        ${childGen.code}
383        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
384        $resultCode""", isNull = "false")
385    }
386  }
387}
388
389/**
390 * An expression with two inputs and one output. The output is by default evaluated to null
391 * if any input is evaluated to null.
392 */
393abstract class BinaryExpression extends Expression {
394
395  def left: Expression
396  def right: Expression
397
398  override final def children: Seq[Expression] = Seq(left, right)
399
400  override def foldable: Boolean = left.foldable && right.foldable
401
402  override def nullable: Boolean = left.nullable || right.nullable
403
404  /**
405   * Default behavior of evaluation according to the default nullability of BinaryExpression.
406   * If subclass of BinaryExpression override nullable, probably should also override this.
407   */
408  override def eval(input: InternalRow): Any = {
409    val value1 = left.eval(input)
410    if (value1 == null) {
411      null
412    } else {
413      val value2 = right.eval(input)
414      if (value2 == null) {
415        null
416      } else {
417        nullSafeEval(value1, value2)
418      }
419    }
420  }
421
422  /**
423   * Called by default [[eval]] implementation.  If subclass of BinaryExpression keep the default
424   * nullability, they can override this method to save null-check code.  If we need full control
425   * of evaluation process, we should override [[eval]].
426   */
427  protected def nullSafeEval(input1: Any, input2: Any): Any =
428    sys.error(s"BinaryExpressions must override either eval or nullSafeEval")
429
430  /**
431   * Short hand for generating binary evaluation code.
432   * If either of the sub-expressions is null, the result of this computation
433   * is assumed to be null.
434   *
435   * @param f accepts two variable names and returns Java code to compute the output.
436   */
437  protected def defineCodeGen(
438      ctx: CodegenContext,
439      ev: ExprCode,
440      f: (String, String) => String): ExprCode = {
441    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
442      s"${ev.value} = ${f(eval1, eval2)};"
443    })
444  }
445
446  /**
447   * Short hand for generating binary evaluation code.
448   * If either of the sub-expressions is null, the result of this computation
449   * is assumed to be null.
450   *
451   * @param f function that accepts the 2 non-null evaluation result names of children
452   *          and returns Java code to compute the output.
453   */
454  protected def nullSafeCodeGen(
455      ctx: CodegenContext,
456      ev: ExprCode,
457      f: (String, String) => String): ExprCode = {
458    val leftGen = left.genCode(ctx)
459    val rightGen = right.genCode(ctx)
460    val resultCode = f(leftGen.value, rightGen.value)
461
462    if (nullable) {
463      val nullSafeEval =
464        leftGen.code + ctx.nullSafeExec(left.nullable, leftGen.isNull) {
465          rightGen.code + ctx.nullSafeExec(right.nullable, rightGen.isNull) {
466            s"""
467              ${ev.isNull} = false; // resultCode could change nullability.
468              $resultCode
469            """
470          }
471      }
472
473      ev.copy(code = s"""
474        boolean ${ev.isNull} = true;
475        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
476        $nullSafeEval
477      """)
478    } else {
479      ev.copy(code = s"""
480        boolean ${ev.isNull} = false;
481        ${leftGen.code}
482        ${rightGen.code}
483        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
484        $resultCode""", isNull = "false")
485    }
486  }
487}
488
489
490/**
491 * A [[BinaryExpression]] that is an operator, with two properties:
492 *
493 * 1. The string representation is "x symbol y", rather than "funcName(x, y)".
494 * 2. Two inputs are expected to the be same type. If the two inputs have different types,
495 *    the analyzer will find the tightest common type and do the proper type casting.
496 */
497abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes {
498
499  /**
500   * Expected input type from both left/right child expressions, similar to the
501   * [[ImplicitCastInputTypes]] trait.
502   */
503  def inputType: AbstractDataType
504
505  def symbol: String
506
507  def sqlOperator: String = symbol
508
509  override def toString: String = s"($left $symbol $right)"
510
511  override def inputTypes: Seq[AbstractDataType] = Seq(inputType, inputType)
512
513  override def checkInputDataTypes(): TypeCheckResult = {
514    // First check whether left and right have the same type, then check if the type is acceptable.
515    if (!left.dataType.sameType(right.dataType)) {
516      TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " +
517        s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).")
518    } else if (!inputType.acceptsType(left.dataType)) {
519      TypeCheckResult.TypeCheckFailure(s"'$sql' requires ${inputType.simpleString} type," +
520        s" not ${left.dataType.simpleString}")
521    } else {
522      TypeCheckResult.TypeCheckSuccess
523    }
524  }
525
526  override def sql: String = s"(${left.sql} $sqlOperator ${right.sql})"
527}
528
529
530object BinaryOperator {
531  def unapply(e: BinaryOperator): Option[(Expression, Expression)] = Some((e.left, e.right))
532}
533
534/**
535 * An expression with three inputs and one output. The output is by default evaluated to null
536 * if any input is evaluated to null.
537 */
538abstract class TernaryExpression extends Expression {
539
540  override def foldable: Boolean = children.forall(_.foldable)
541
542  override def nullable: Boolean = children.exists(_.nullable)
543
544  /**
545   * Default behavior of evaluation according to the default nullability of TernaryExpression.
546   * If subclass of TernaryExpression override nullable, probably should also override this.
547   */
548  override def eval(input: InternalRow): Any = {
549    val exprs = children
550    val value1 = exprs(0).eval(input)
551    if (value1 != null) {
552      val value2 = exprs(1).eval(input)
553      if (value2 != null) {
554        val value3 = exprs(2).eval(input)
555        if (value3 != null) {
556          return nullSafeEval(value1, value2, value3)
557        }
558      }
559    }
560    null
561  }
562
563  /**
564   * Called by default [[eval]] implementation.  If subclass of TernaryExpression keep the default
565   * nullability, they can override this method to save null-check code.  If we need full control
566   * of evaluation process, we should override [[eval]].
567   */
568  protected def nullSafeEval(input1: Any, input2: Any, input3: Any): Any =
569    sys.error(s"TernaryExpressions must override either eval or nullSafeEval")
570
571  /**
572   * Short hand for generating ternary evaluation code.
573   * If either of the sub-expressions is null, the result of this computation
574   * is assumed to be null.
575   *
576   * @param f accepts three variable names and returns Java code to compute the output.
577   */
578  protected def defineCodeGen(
579    ctx: CodegenContext,
580    ev: ExprCode,
581    f: (String, String, String) => String): ExprCode = {
582    nullSafeCodeGen(ctx, ev, (eval1, eval2, eval3) => {
583      s"${ev.value} = ${f(eval1, eval2, eval3)};"
584    })
585  }
586
587  /**
588   * Short hand for generating ternary evaluation code.
589   * If either of the sub-expressions is null, the result of this computation
590   * is assumed to be null.
591   *
592   * @param f function that accepts the 3 non-null evaluation result names of children
593   *          and returns Java code to compute the output.
594   */
595  protected def nullSafeCodeGen(
596    ctx: CodegenContext,
597    ev: ExprCode,
598    f: (String, String, String) => String): ExprCode = {
599    val leftGen = children(0).genCode(ctx)
600    val midGen = children(1).genCode(ctx)
601    val rightGen = children(2).genCode(ctx)
602    val resultCode = f(leftGen.value, midGen.value, rightGen.value)
603
604    if (nullable) {
605      val nullSafeEval =
606        leftGen.code + ctx.nullSafeExec(children(0).nullable, leftGen.isNull) {
607          midGen.code + ctx.nullSafeExec(children(1).nullable, midGen.isNull) {
608            rightGen.code + ctx.nullSafeExec(children(2).nullable, rightGen.isNull) {
609              s"""
610                ${ev.isNull} = false; // resultCode could change nullability.
611                $resultCode
612              """
613            }
614          }
615      }
616
617      ev.copy(code = s"""
618        boolean ${ev.isNull} = true;
619        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
620        $nullSafeEval""")
621    } else {
622      ev.copy(code = s"""
623        boolean ${ev.isNull} = false;
624        ${leftGen.code}
625        ${midGen.code}
626        ${rightGen.code}
627        ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
628        $resultCode""", isNull = "false")
629    }
630  }
631}
632