SPARK-6433 hive tests to import spark-sql test JAR for QueryTest access
1. Test JARs are built & published 2. log4j.properties is explicitly excluded. Without this, downstream test run logging depends on the order the JARs are listed/loaded 3. sql/hive pulls in spark-sql & spark-catalyst for its test runs 4. The copied in test classes were rm'd, and a test edited to remove its now duplicate assert method 5. Spark streaming is now built with the same plugin/phase as the rest, but its shade plugin declaration is kept in (so different from the rest of the test plugins). Due to (#2), this means the test JAR no longer includes its log4j file. Outstanding issues: * should the JARs be shaded? `spark-streaming-test.jar` is, but given these are test jars for developers only, especially in the same spark source tree, it's hard to justify. * `maven-jar-plugin` v 2.6 was explicitly selected; without this the apache-1.4 parent template JAR version (2.4) is chosen. * Are there any other resources to exclude? Author: Steve Loughran <stevel@hortonworks.com> Closes #5119 from steveloughran/stevel/patches/SPARK-6433-test-jars and squashes the following commits: 81ceb01 [Steve Loughran] SPARK-6433 add a clearer comment explaining what the plugin is doing & why a6dca33 [Steve Loughran] SPARK-6433 : pull configuration section form archive plugin c2b5f89 [Steve Loughran] SPARK-6433 omit "jar" goal from jar plugin fdac51b [Steve Loughran] SPARK-6433 -002; indentation & delegate plugin version to parent 650f442 [Steve Loughran] SPARK-6433 patch 001: test JARs are built; sql/hive pulls in spark-sql & spark-catalyst for its test runs
This commit is contained in:
parent
d36c5fca7b
commit
ee11be2582
20
pom.xml
20
pom.xml
|
@ -1265,6 +1265,7 @@
|
|||
<id>create-source-jar</id>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
<goal>test-jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
|
@ -1473,6 +1474,25 @@
|
|||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<!-- Build test-jars for all projects, since some projects depend on tests from others -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>prepare-test-jar</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>log4j.properties</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
|
|
@ -89,6 +89,20 @@
|
|||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||
<type>test-jar</type>
|
||||
<version>${project.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
|
||||
<type>test-jar</type>
|
||||
<version>${project.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<profiles>
|
||||
<profile>
|
||||
|
|
|
@ -1,140 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql
|
||||
|
||||
import scala.collection.JavaConversions._
|
||||
|
||||
import org.apache.spark.sql.catalyst.plans._
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
|
||||
|
||||
/**
 * *** DUPLICATED FROM sql/core. ***
 *
 * It is hard to have maven allow one subproject depend on another subproject's test code.
 * So, we duplicate this code here.
 *
 * Test base class with helpers that execute a [[DataFrame]] and assert on the collected rows.
 */
class QueryTest extends PlanTest {

  /**
   * Runs the plan and makes sure the answer either contains all of the keywords or contains
   * none of them.
   *
   * @param rdd the [[DataFrame]] to be executed
   * @param exists true to assert that every keyword appears in the output, false to assert
   *               that no keyword appears in the output
   * @param keywords the keywords to look for
   */
  def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*): Unit = {
    // Every row is flattened and concatenated without separators, so a keyword is matched
    // against one single string holding the whole result.
    val outputs = rdd.collect().map(_.mkString).mkString
    for (key <- keywords) {
      if (exists) {
        assert(outputs.contains(key), s"Failed for $rdd ($key doesn't exist in result)")
      } else {
        assert(!outputs.contains(key), s"Failed for $rdd ($key existed in the result)")
      }
    }
  }

  /**
   * Runs the plan and makes sure the answer matches the expected result.
   * Fails the test with the message produced by `QueryTest.checkAnswer` when it does not.
   *
   * @param rdd the [[DataFrame]] to be executed
   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
   */
  protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = {
    // The companion returns Some(message) on mismatch or execution error, None on success.
    QueryTest.checkAnswer(rdd, expectedAnswer).foreach(errorMessage => fail(errorMessage))
  }

  /**
   * Single-row convenience overload of `checkAnswer`.
   *
   * @param rdd the [[DataFrame]] to be executed
   * @param expectedAnswer the only [[Row]] the result is expected to contain
   */
  protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = {
    checkAnswer(rdd, Seq(expectedAnswer))
  }

  /**
   * Registers a test, named after the SQL text, that runs the statement through the implicit
   * [[SQLContext]] and checks its answer.
   *
   * @param sqlString the SQL statement to run; also used as the test name
   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
   * @param sqlContext the context used to execute `sqlString`
   */
  def sqlTest(sqlString: String, expectedAnswer: Seq[Row])
      (implicit sqlContext: SQLContext): Unit = {
    test(sqlString) {
      checkAnswer(sqlContext.sql(sqlString), expectedAnswer)
    }
  }
}
|
||||
|
||||
object QueryTest {
  /**
   * Runs the plan and makes sure the answer matches the expected result.
   * If there was exception during the execution or the contents of the DataFrame does not
   * match the expected result, an error message will be returned. Otherwise, a [[None]] will
   * be returned.
   *
   * @param rdd the [[DataFrame]] to be executed
   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
   * @return `Some(errorMessage)` on an execution failure or a mismatch, `None` on success
   */
  def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = {
    // Row order is only compared when the logical plan contains a Sort node; otherwise both
    // sides are put into a canonical order before comparison.
    val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty

    def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
      // Converts data to types that we can do equality comparison using Scala collections.
      // For BigDecimal type, the Scala type has a better definition of equality test (similar to
      // Java's java.math.BigDecimal.compareTo).
      val converted: Seq[Row] = answer.map { s =>
        Row.fromSeq(s.toSeq.map {
          case d: java.math.BigDecimal => BigDecimal(d)
          case o => o
        })
      }
      if (!isSorted) converted.sortBy(_.toString) else converted
    }

    // Left carries the failure report, Right the collected rows. This replaces the early
    // `return` out of the catch handler with a plain value.
    val collected: Either[String, Seq[Row]] =
      try {
        Right(rdd.collect().toSeq)
      } catch {
        case e: Exception =>
          Left(
            s"""
              |Exception thrown while executing query:
              |${rdd.queryExecution}
              |== Exception ==
              |$e
              |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
            """.stripMargin)
      }

    collected match {
      case Left(errorMessage) =>
        Some(errorMessage)

      case Right(sparkAnswer) =>
        // Normalise each side once and reuse it for both the comparison and the report.
        val expected = prepareAnswer(expectedAnswer)
        val actual = prepareAnswer(sparkAnswer)
        if (expected != actual) {
          val comparison = sideBySide(
            s"== Correct Answer - ${expectedAnswer.size} ==" +: expected.map(_.toString),
            s"== Spark Answer - ${sparkAnswer.size} ==" +: actual.map(_.toString)).mkString("\n")
          Some(
            s"""
              |Results do not match for query:
              |${rdd.logicalPlan}
              |== Analyzed Plan ==
              |${rdd.queryExecution.analyzed}
              |== Physical Plan ==
              |${rdd.queryExecution.executedPlan}
              |== Results ==
              |$comparison
            """.stripMargin)
        } else {
          None
        }
    }
  }

  /**
   * Java-friendly variant of `checkAnswer`.
   *
   * @param rdd the [[DataFrame]] to be executed
   * @param expectedAnswer the expected rows as a Java list
   * @return the error message on failure, or `null` when the answer matches
   */
  def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = {
    // `toSeq` on the Java list relies on the file-level scala.collection.JavaConversions import.
    checkAnswer(rdd, expectedAnswer.toSeq).orNull
  }
}
|
|
@ -1,57 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.catalyst.plans
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
/**
 * *** DUPLICATED FROM sql/catalyst/plans. ***
 *
 * It is hard to have maven allow one subproject depend on another subproject's test code.
 * So, we duplicate this code here.
 *
 * Test base class providing helpers to compare logical plans.
 */
class PlanTest extends FunSuite {

  /**
   * Since attribute references are given globally unique ids during analysis,
   * we must normalize them to check if two different queries are identical.
   *
   * @param plan the plan whose expressions are rewritten
   * @return the plan with every [[AttributeReference]] and [[Alias]] rebuilt with `ExprId(0)`
   */
  protected def normalizeExprIds(plan: LogicalPlan): LogicalPlan = {
    plan transformAllExpressions {
      case a: AttributeReference =>
        AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0))
      case a: Alias =>
        Alias(a.child, a.name)(exprId = ExprId(0))
    }
  }

  /**
   * Fails the test if the two plans do not match.
   * Both plans are normalized with `normalizeExprIds` before they are compared.
   *
   * @param plan1 the first plan to compare
   * @param plan2 the second plan to compare
   */
  protected def comparePlans(plan1: LogicalPlan, plan2: LogicalPlan): Unit = {
    val normalized1 = normalizeExprIds(plan1)
    val normalized2 = normalizeExprIds(plan2)
    if (normalized1 != normalized2) {
      fail(
        s"""
          |== FAIL: Plans do not match ===
          |${sideBySide(normalized1.treeString, normalized2.treeString).mkString("\n")}
        """.stripMargin)
    }
  }
}
|
|
@ -24,21 +24,6 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest}
|
|||
import org.apache.spark.storage.RDDBlockId
|
||||
|
||||
class CachedTableSuite extends QueryTest {
|
||||
/**
|
||||
* Throws a test failed exception when the number of cached tables differs from the expected
|
||||
* number.
|
||||
*/
|
||||
def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = {
|
||||
val planWithCaching = query.queryExecution.withCachedData
|
||||
val cachedData = planWithCaching collect {
|
||||
case cached: InMemoryRelation => cached
|
||||
}
|
||||
|
||||
assert(
|
||||
cachedData.size == numCachedTables,
|
||||
s"Expected query to contain $numCachedTables, but it actually had ${cachedData.size}\n" +
|
||||
planWithCaching)
|
||||
}
|
||||
|
||||
def rddIdOf(tableName: String): Int = {
|
||||
val executedPlan = table(tableName).queryExecution.executedPlan
|
||||
|
|
|
@ -97,34 +97,6 @@
|
|||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<!--
|
||||
This plugin forces the generation of jar containing streaming test classes,
|
||||
so that the tests classes of external modules can use them. The two execution profiles
|
||||
are necessary - first one for 'mvn package', second one for 'mvn test-compile'. Ideally,
|
||||
'mvn compile' should not compile test classes and therefore should not need this.
|
||||
However, an open Maven bug (http://jira.codehaus.org/browse/MNG-3559)
|
||||
causes the compilation to fail if streaming test-jar is not generated. Hence, the
|
||||
second execution profile for 'mvn test-compile'.
|
||||
-->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>test-jar-on-test-compile</id>
|
||||
<phase>test-compile</phase>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
|
|
Loading…
Reference in a new issue