SPARK-6433 hive tests to import spark-sql test JAR for QueryTest access

1. Test JARs are built & published
1. The `log4j.properties` resource is explicitly excluded from the test JARs. Without this, logging in downstream test runs depends on the order in which the JARs are listed/loaded
1. sql/hive pulls in the spark-sql & spark-catalyst test JARs for its test runs (see the sketch after this list)
1. The copied-in test classes were removed, and a test was edited to drop its now-duplicate assert method
1. Spark streaming is now built with the same plugin/phase as the rest, but its shade-plugin declaration is kept (so it differs from the rest of the test plugins). Due to (#2), the test JAR no longer includes its log4j file.
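With the test JARs on the test classpath, a hive suite can extend the shared `QueryTest` directly instead of a duplicated copy. A minimal sketch of what this enables (the suite name and query are illustrative, not part of this patch):

package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHive

// Hypothetical suite: QueryTest now resolves from the spark-sql test JAR
// rather than from a copy checked in under sql/hive.
class ExampleHiveQuerySuite extends QueryTest {
  test("trivial arithmetic through HiveQL") {
    checkAnswer(TestHive.sql("SELECT 1 + 1"), Row(2))
  }
}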

Outstanding issues:
* Should the JARs be shaded? `spark-streaming-test.jar` is, but given these are test JARs for developers only, used mainly within the same Spark source tree, shading is hard to justify.
* `maven-jar-plugin` v2.6 was explicitly selected; without this, the apache-1.4 parent template's JAR plugin version (2.4) is chosen.
* Are there any other resources to exclude?

Author: Steve Loughran <stevel@hortonworks.com>

Closes #5119 from steveloughran/stevel/patches/SPARK-6433-test-jars and squashes the following commits:

81ceb01 [Steve Loughran] SPARK-6433 add a clearer comment explaining what the plugin is doing & why
a6dca33 [Steve Loughran] SPARK-6433 : pull configuration section from archive plugin
c2b5f89 [Steve Loughran] SPARK-6433 omit "jar" goal from jar plugin
fdac51b [Steve Loughran] SPARK-6433 -002; indentation & delegate plugin version to parent
650f442 [Steve Loughran] SPARK-6433 patch 001: test JARs are built; sql/hive pulls in spark-sql & spark-catalyst for its test runs
Steve Loughran 2015-04-01 16:26:54 +01:00 committed by Sean Owen
parent d36c5fca7b
commit ee11be2582
6 changed files with 34 additions and 240 deletions

pom.xml

@@ -1265,6 +1265,7 @@
<id>create-source-jar</id>
<goals>
<goal>jar-no-fork</goal>
<goal>test-jar-no-fork</goal>
</goals>
</execution>
</executions>
@@ -1473,6 +1474,25 @@
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>
<!-- Build test-jars for all projects, since some projects depend on tests from others -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>prepare-test-jar</id>
<phase>prepare-package</phase>
<goals>
<goal>test-jar</goal>
</goals>
<configuration>
<excludes>
<exclude>log4j.properties</exclude>
</excludes>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
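As a point of comparison, an sbt build would consume one of these test JARs via the `tests` classifier; a sketch, with the version value assumed rather than taken from this patch:

// build.sbt (illustrative): the sbt analogue of a <type>test-jar</type> dependency.
val sparkVersion = "1.4.0-SNAPSHOT" // assumed; match the locally built version
libraryDependencies +=
  "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests"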

sql/hive/pom.xml

@@ -89,6 +89,20 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<type>test-jar</type>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<type>test-jar</type>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>
<profile>

sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala

@@ -1,140 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql

import scala.collection.JavaConversions._

import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.util._

/**
* *** DUPLICATED FROM sql/core. ***
*
* It is hard to have maven allow one subproject depend on another subproject's test code.
* So, we duplicate this code here.
*/
class QueryTest extends PlanTest {
/**
* Runs the plan and makes sure the answer contains all of the keywords, or that
* none of the keywords appear in the answer.
* @param rdd the [[DataFrame]] to be executed
* @param exists true to make sure the keywords are listed in the output,
* false to make sure none of the keywords appear in the output
* @param keywords the keywords to check for in the output
*/
def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*) {
val outputs = rdd.collect().map(_.mkString).mkString
for (key <- keywords) {
if (exists) {
assert(outputs.contains(key), s"Failed for $rdd ($key doesn't exist in result)")
} else {
assert(!outputs.contains(key), s"Failed for $rdd ($key existed in the result)")
}
}
}
/**
* Runs the plan and makes sure the answer matches the expected result.
* @param rdd the [[DataFrame]] to be executed
* @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
*/
protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = {
QueryTest.checkAnswer(rdd, expectedAnswer) match {
case Some(errorMessage) => fail(errorMessage)
case None =>
}
}
protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = {
checkAnswer(rdd, Seq(expectedAnswer))
}
def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = {
test(sqlString) {
checkAnswer(sqlContext.sql(sqlString), expectedAnswer)
}
}
}
object QueryTest {
/**
* Runs the plan and makes sure the answer matches the expected result.
* If there was an exception during the execution, or the contents of the DataFrame do not
* match the expected result, an error message will be returned. Otherwise, [[None]] will
* be returned.
* @param rdd the [[DataFrame]] to be executed
* @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
*/
def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = {
val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty
def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
// Converts data to types that we can do equality comparison using Scala collections.
// For BigDecimal type, the Scala type has a better definition of equality test (similar to
// Java's java.math.BigDecimal.compareTo).
val converted: Seq[Row] = answer.map { s =>
Row.fromSeq(s.toSeq.map {
case d: java.math.BigDecimal => BigDecimal(d)
case o => o
})
}
if (!isSorted) converted.sortBy(_.toString) else converted
}
val sparkAnswer = try rdd.collect().toSeq catch {
case e: Exception =>
val errorMessage =
s"""
|Exception thrown while executing query:
|${rdd.queryExecution}
|== Exception ==
|$e
|${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
""".stripMargin
return Some(errorMessage)
}
if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) {
val errorMessage =
s"""
|Results do not match for query:
|${rdd.logicalPlan}
|== Analyzed Plan ==
|${rdd.queryExecution.analyzed}
|== Physical Plan ==
|${rdd.queryExecution.executedPlan}
|== Results ==
|${sideBySide(
s"== Correct Answer - ${expectedAnswer.size} ==" +:
prepareAnswer(expectedAnswer).map(_.toString),
s"== Spark Answer - ${sparkAnswer.size} ==" +:
prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")}
""".stripMargin
return Some(errorMessage)
}
return None
}
def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = {
checkAnswer(rdd, expectedAnswer.toSeq) match {
case Some(errorMessage) => errorMessage
case None => null
}
}
}
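The helpers deleted above remain available from sql/core's QueryTest via the new test-jar dependency; a typical call pattern looks like this, with the table and values illustrative (in the style of the Hive `src` fixture):

// Illustrative usage of the checkAnswer/checkExistence helpers, assuming a
// sqlContext is in scope inside a QueryTest suite.
val df = sqlContext.sql("SELECT key, value FROM src WHERE key = 86")
checkAnswer(df, Row(86, "val_86"))          // exact result comparison
checkExistence(df, exists = true, "val_86") // keyword must appear in the output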

sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala

@@ -1,57 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.plans

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.util._
import org.scalatest.FunSuite

/**
* *** DUPLICATED FROM sql/catalyst/plans. ***
*
* It is hard to have maven allow one subproject depend on another subproject's test code.
* So, we duplicate this code here.
*/
class PlanTest extends FunSuite {
/**
* Since attribute references are given globally unique ids during analysis,
* we must normalize them to check if two different queries are identical.
*/
protected def normalizeExprIds(plan: LogicalPlan) = {
plan transformAllExpressions {
case a: AttributeReference =>
AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0))
case a: Alias =>
Alias(a.child, a.name)(exprId = ExprId(0))
}
}
/** Fails the test if the two plans do not match */
protected def comparePlans(plan1: LogicalPlan, plan2: LogicalPlan) {
val normalized1 = normalizeExprIds(plan1)
val normalized2 = normalizeExprIds(plan2)
if (normalized1 != normalized2)
fail(
s"""
|== FAIL: Plans do not match ===
|${sideBySide(normalized1.treeString, normalized2.treeString).mkString("\n")}
""".stripMargin)
}
}
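The ExprId normalization above is what lets two independently constructed but logically identical plans compare equal. A sketch using the catalyst test DSL (the DSL helpers are assumed, not part of this patch):

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

class ExamplePlanSuite extends PlanTest {
  test("plans differing only in ExprIds compare equal") {
    val rel = LocalRelation('a.int)
    // Each 'a.int builds an AttributeReference with a fresh ExprId, so the raw
    // plans differ; normalizeExprIds maps every id to ExprId(0) before comparing.
    comparePlans(rel.select('a.int), rel.select('a.int))
  }
}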

sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala

@@ -24,21 +24,6 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest}
import org.apache.spark.storage.RDDBlockId
class CachedTableSuite extends QueryTest {
/**
* Throws a test failed exception when the number of cached tables differs from the expected
* number.
*/
def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = {
val planWithCaching = query.queryExecution.withCachedData
val cachedData = planWithCaching collect {
case cached: InMemoryRelation => cached
}
assert(
cachedData.size == numCachedTables,
s"Expected query to contain $numCachedTables, but it actually had ${cachedData.size}\n" +
planWithCaching)
}
def rddIdOf(tableName: String): Int = {
val executedPlan = table(tableName).queryExecution.executedPlan

streaming/pom.xml

@@ -97,34 +97,6 @@
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<!--
This plugin forces the generation of a jar containing streaming test classes,
so that the test classes of external modules can use them. The two executions
are necessary - the first for 'mvn package', the second for 'mvn test-compile'. Ideally,
'mvn compile' should not compile test classes and therefore should not need this.
However, an open Maven bug (http://jira.codehaus.org/browse/MNG-3559)
causes the compilation to fail if the streaming test-jar is not generated. Hence, the
second execution for 'mvn test-compile'.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
<execution>
<id>test-jar-on-test-compile</id>
<phase>test-compile</phase>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>