[SPARK-26168][SQL] Update the code comments in Expression and Aggregate
## What changes were proposed in this pull request? This PR is to improve the code comments to document some common traits and traps about the expression. ## How was this patch tested? N/A Closes #23135 from gatorsmile/addcomments. Authored-by: gatorsmile <gatorsmile@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
6ab8485da2
commit
6bb60b30fd
|
@ -181,8 +181,9 @@ object TypeCoercion {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The method finds a common type for data types that differ only in nullable, containsNull
|
* The method finds a common type for data types that differ only in nullable flags, including
|
||||||
* and valueContainsNull flags. If the input types are too different, None is returned.
|
* `nullable`, `containsNull` of [[ArrayType]] and `valueContainsNull` of [[MapType]].
|
||||||
|
* If the input types are different besides nullable flags, None is returned.
|
||||||
*/
|
*/
|
||||||
def findCommonTypeDifferentOnlyInNullFlags(t1: DataType, t2: DataType): Option[DataType] = {
|
def findCommonTypeDifferentOnlyInNullFlags(t1: DataType, t2: DataType): Option[DataType] = {
|
||||||
if (t1 == t2) {
|
if (t1 == t2) {
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion}
|
||||||
import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
|
import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
|
||||||
import org.apache.spark.sql.catalyst.expressions.codegen._
|
import org.apache.spark.sql.catalyst.expressions.codegen._
|
||||||
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
|
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
|
||||||
|
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||||
import org.apache.spark.sql.catalyst.trees.TreeNode
|
import org.apache.spark.sql.catalyst.trees.TreeNode
|
||||||
import org.apache.spark.sql.catalyst.util.truncatedString
|
import org.apache.spark.sql.catalyst.util.truncatedString
|
||||||
import org.apache.spark.sql.internal.SQLConf
|
import org.apache.spark.sql.internal.SQLConf
|
||||||
|
@ -40,12 +41,28 @@ import org.apache.spark.sql.types._
|
||||||
* "name(arguments...)", the concrete implementation must be a case class whose constructor
|
* "name(arguments...)", the concrete implementation must be a case class whose constructor
|
||||||
* arguments are all Expressions types. See [[Substring]] for an example.
|
* arguments are all Expressions types. See [[Substring]] for an example.
|
||||||
*
|
*
|
||||||
* There are a few important traits:
|
* There are a few important traits or abstract classes:
|
||||||
*
|
*
|
||||||
* - [[Nondeterministic]]: an expression that is not deterministic.
|
* - [[Nondeterministic]]: an expression that is not deterministic.
|
||||||
|
* - [[Stateful]]: an expression that contains mutable state. For example, MonotonicallyIncreasingID
|
||||||
|
* and Rand. A stateful expression is always non-deterministic.
|
||||||
* - [[Unevaluable]]: an expression that is not supposed to be evaluated.
|
* - [[Unevaluable]]: an expression that is not supposed to be evaluated.
|
||||||
* - [[CodegenFallback]]: an expression that does not have code gen implemented and falls back to
|
* - [[CodegenFallback]]: an expression that does not have code gen implemented and falls back to
|
||||||
* interpreted mode.
|
* interpreted mode.
|
||||||
|
* - [[NullIntolerant]]: an expression that is null intolerant (i.e. any null input will result in
|
||||||
|
* null output).
|
||||||
|
* - [[NonSQLExpression]]: a common base trait for the expressions that do not have SQL
|
||||||
|
* expressions like representation. For example, `ScalaUDF`, `ScalaUDAF`,
|
||||||
|
* and object `MapObjects` and `Invoke`.
|
||||||
|
* - [[UserDefinedExpression]]: a common base trait for user-defined functions, including
|
||||||
|
* UDF/UDAF/UDTF.
|
||||||
|
* - [[HigherOrderFunction]]: a common base trait for higher order functions that take one or more
|
||||||
|
* (lambda) functions and applies these to some objects. The function
|
||||||
|
* produces a number of variables which can be consumed by some lambda
|
||||||
|
* functions.
|
||||||
|
* - [[NamedExpression]]: An [[Expression]] that is named.
|
||||||
|
* - [[TimeZoneAwareExpression]]: A common base trait for time zone aware expressions.
|
||||||
|
* - [[SubqueryExpression]]: A base interface for expressions that contain a [[LogicalPlan]].
|
||||||
*
|
*
|
||||||
* - [[LeafExpression]]: an expression that has no child.
|
* - [[LeafExpression]]: an expression that has no child.
|
||||||
* - [[UnaryExpression]]: an expression that has one child.
|
* - [[UnaryExpression]]: an expression that has one child.
|
||||||
|
@ -54,12 +71,20 @@ import org.apache.spark.sql.types._
|
||||||
* - [[BinaryOperator]]: a special case of [[BinaryExpression]] that requires two children to have
|
* - [[BinaryOperator]]: a special case of [[BinaryExpression]] that requires two children to have
|
||||||
* the same output data type.
|
* the same output data type.
|
||||||
*
|
*
|
||||||
|
* A few important traits used for type coercion rules:
|
||||||
|
* - [[ExpectsInputTypes]]: an expression that has the expected input types. This trait is typically
|
||||||
|
* used by operator expressions (e.g. [[Add]], [[Subtract]]) to define
|
||||||
|
* expected input types without any implicit casting.
|
||||||
|
* - [[ImplicitCastInputTypes]]: an expression that has the expected input types, which can be
|
||||||
|
* implicitly castable using [[TypeCoercion.ImplicitTypeCasts]].
|
||||||
|
* - [[ComplexTypeMergingExpression]]: to resolve output types of the complex expressions
|
||||||
|
* (e.g., [[CaseWhen]]).
|
||||||
*/
|
*/
|
||||||
abstract class Expression extends TreeNode[Expression] {
|
abstract class Expression extends TreeNode[Expression] {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true when an expression is a candidate for static evaluation before the query is
|
* Returns true when an expression is a candidate for static evaluation before the query is
|
||||||
* executed.
|
* executed. A typical use case: [[org.apache.spark.sql.catalyst.optimizer.ConstantFolding]]
|
||||||
*
|
*
|
||||||
* The following conditions are used to determine suitability for constant folding:
|
* The following conditions are used to determine suitability for constant folding:
|
||||||
* - A [[Coalesce]] is foldable if all of its children are foldable
|
* - A [[Coalesce]] is foldable if all of its children are foldable
|
||||||
|
@ -72,7 +97,8 @@ abstract class Expression extends TreeNode[Expression] {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true when the current expression always return the same result for fixed inputs from
|
* Returns true when the current expression always return the same result for fixed inputs from
|
||||||
* children.
|
* children. The non-deterministic expressions should not change in number and order. They should
|
||||||
|
* not be evaluated during the query planning.
|
||||||
*
|
*
|
||||||
* Note that this means that an expression should be considered as non-deterministic if:
|
* Note that this means that an expression should be considered as non-deterministic if:
|
||||||
* - it relies on some mutable internal state, or
|
* - it relies on some mutable internal state, or
|
||||||
|
@ -252,8 +278,9 @@ abstract class Expression extends TreeNode[Expression] {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An expression that cannot be evaluated. Some expressions don't live past analysis or optimization
|
* An expression that cannot be evaluated. These expressions don't live past analysis or
|
||||||
* time (e.g. Star). This trait is used by those expressions.
|
* optimization time (e.g. Star) and should not be evaluated during query planning and
|
||||||
|
* execution.
|
||||||
*/
|
*/
|
||||||
trait Unevaluable extends Expression {
|
trait Unevaluable extends Expression {
|
||||||
|
|
||||||
|
@ -724,9 +751,10 @@ abstract class TernaryExpression extends Expression {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A trait resolving nullable, containsNull, valueContainsNull flags of the output date type.
|
* A trait used for resolving nullable flags, including `nullable`, `containsNull` of [[ArrayType]]
|
||||||
* This logic is usually utilized by expressions combining data from multiple child expressions
|
* and `valueContainsNull` of [[MapType]], containsNull, valueContainsNull flags of the output date
|
||||||
* of non-primitive types (e.g. [[CaseWhen]]).
|
* type. This is usually utilized by the expressions (e.g. [[CaseWhen]]) that combine data from
|
||||||
|
* multiple child expressions of non-primitive types.
|
||||||
*/
|
*/
|
||||||
trait ComplexTypeMergingExpression extends Expression {
|
trait ComplexTypeMergingExpression extends Expression {
|
||||||
|
|
||||||
|
|
|
@ -130,6 +130,9 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn
|
||||||
* Note that exprId and qualifiers are in a separate parameter list because
|
* Note that exprId and qualifiers are in a separate parameter list because
|
||||||
* we only pattern match on child and name.
|
* we only pattern match on child and name.
|
||||||
*
|
*
|
||||||
|
* Note that when creating a new Alias, all the [[AttributeReference]] that refer to
|
||||||
|
* the original alias should be updated to the new one.
|
||||||
|
*
|
||||||
* @param child The computation being performed
|
* @param child The computation being performed
|
||||||
* @param name The name to be associated with the result of computing [[child]].
|
* @param name The name to be associated with the result of computing [[child]].
|
||||||
* @param exprId A globally unique id used to check if an [[AttributeReference]] refers to this
|
* @param exprId A globally unique id used to check if an [[AttributeReference]] refers to this
|
||||||
|
|
|
@ -17,11 +17,11 @@
|
||||||
|
|
||||||
package org.apache.spark.sql.catalyst.plans.logical
|
package org.apache.spark.sql.catalyst.plans.logical
|
||||||
|
|
||||||
import org.apache.spark.sql.catalyst.{AliasIdentifier}
|
import org.apache.spark.sql.catalyst.AliasIdentifier
|
||||||
import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
|
import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
|
||||||
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable}
|
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable}
|
||||||
import org.apache.spark.sql.catalyst.expressions._
|
import org.apache.spark.sql.catalyst.expressions._
|
||||||
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
|
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction}
|
||||||
import org.apache.spark.sql.catalyst.plans._
|
import org.apache.spark.sql.catalyst.plans._
|
||||||
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
|
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
|
||||||
import org.apache.spark.sql.catalyst.util.truncatedString
|
import org.apache.spark.sql.catalyst.util.truncatedString
|
||||||
|
@ -575,6 +575,18 @@ case class Range(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a Group by operator with the aggregate functions and projections.
|
||||||
|
*
|
||||||
|
* @param groupingExpressions expressions for grouping keys
|
||||||
|
* @param aggregateExpressions expressions for a project list, which could contain
|
||||||
|
* [[AggregateFunction]]s.
|
||||||
|
*
|
||||||
|
* Note: Currently, aggregateExpressions is the project list of this Group by operator. Before
|
||||||
|
* separating projection from grouping and aggregate, we should avoid expression-level optimization
|
||||||
|
* on aggregateExpressions, which could reference an expression in groupingExpressions.
|
||||||
|
* For example, see the rule [[org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps]]
|
||||||
|
*/
|
||||||
case class Aggregate(
|
case class Aggregate(
|
||||||
groupingExpressions: Seq[Expression],
|
groupingExpressions: Seq[Expression],
|
||||||
aggregateExpressions: Seq[NamedExpression],
|
aggregateExpressions: Seq[NamedExpression],
|
||||||
|
|
Loading…
Reference in a new issue