SPARK-6433 hive tests to import spark-sql test JAR for QueryTest access

1. Test JARs are built & published 2. log4j.properties is explicitly excluded. Without this, downstream test run logging depends on the order the JARs are listed/loaded 3. sql/hive pulls in spark-sql & spark-catalyst for its test runs 4. The copied-in test classes were rm'd, and a test was edited to remove its now-duplicate assert method 5. Spark streaming is now built with the same plugin/phase as the rest, but its shade plugin declaration is kept in (so different from the rest of the test plugins). Due to (#2), this means the test JAR no longer includes its log4j file. Outstanding issues: * should the JARs be shaded? `spark-streaming-test.jar` is, but given these are test jars for developers only, especially in the same spark source tree, it's hard to justify. * `maven-jar-plugin` v 2.6 was explicitly selected; without this the apache-1.4 parent template JAR version (2.4) is chosen. * Are there any other resources to exclude? Author: Steve Loughran <stevel@hortonworks.com> Closes #5119 from steveloughran/stevel/patches/SPARK-6433-test-jars and squashes the following commits: 81ceb01 [Steve Loughran] SPARK-6433 add a clearer comment explaining what the plugin is doing & why a6dca33 [Steve Loughran] SPARK-6433 : pull configuration section from archive plugin c2b5f89 [Steve Loughran] SPARK-6433 omit "jar" goal from jar plugin fdac51b [Steve Loughran] SPARK-6433 -002; indentation & delegate plugin version to parent 650f442 [Steve Loughran] SPARK-6433 patch 001: test JARs are built; sql/hive pulls in spark-sql & spark-catalyst for its test runs
2015-04-01 16:26:54 +01:00 · 2015-04-01 16:26:54 +01:00 · ee11be2582
parent d36c5fca7b
commit ee11be2582
6 changed files with 34 additions and 240 deletions
--- a/pom.xml
+++ b/pom.xml
@ -1265,6 +1265,7 @@
              <id>create-source-jar</id>
              <goals>
                <goal>jar-no-fork</goal>
+                <goal>test-jar-no-fork</goal>
              </goals>
            </execution>
          </executions>
@ -1473,6 +1474,25 @@
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest-maven-plugin</artifactId>
      </plugin>
+      <!-- Build test-jar's for all projects, since some projects depend on tests from others -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>prepare-test-jar</id>
+            <phase>prepare-package</phase>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+            <configuration>
+              <excludes>
+                <exclude>log4j.properties</exclude>
+              </excludes>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
    </plugins>
  </build>

--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@ -89,6 +89,20 @@
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <type>test-jar</type>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
+      <type>test-jar</type>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
  </dependencies>
  <profiles>
    <profile>
--- a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala
@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql
-
-import scala.collection.JavaConversions._
-
-import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.util._
-
-
-/**
- * *** DUPLICATED FROM sql/core. ***
- *
- * It is hard to have maven allow one subproject depend on another subprojects test code.
- * So, we duplicate this code here.
- */
-class QueryTest extends PlanTest {
-
-  /**
-   * Runs the plan and makes sure the answer contains all of the keywords, or the
-   * none of keywords are listed in the answer
-   * @param rdd the [[DataFrame]] to be executed
-   * @param exists true for make sure the keywords are listed in the output, otherwise
-   *               to make sure none of the keyword are not listed in the output
-   * @param keywords keyword in string array
-   */
-  def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*) {
-    val outputs = rdd.collect().map(_.mkString).mkString
-    for (key <- keywords) {
-      if (exists) {
-        assert(outputs.contains(key), s"Failed for $rdd ($key doens't exist in result)")
-      } else {
-        assert(!outputs.contains(key), s"Failed for $rdd ($key existed in the result)")
-      }
-    }
-  }
-
-  /**
-   * Runs the plan and makes sure the answer matches the expected result.
-   * @param rdd the [[DataFrame]] to be executed
-   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
-   */
-  protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = {
-    QueryTest.checkAnswer(rdd, expectedAnswer) match {
-      case Some(errorMessage) => fail(errorMessage)
-      case None =>
-    }
-  }
-
-  protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = {
-    checkAnswer(rdd, Seq(expectedAnswer))
-  }
-
-  def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = {
-    test(sqlString) {
-      checkAnswer(sqlContext.sql(sqlString), expectedAnswer)
-    }
-  }
-}
-
-object QueryTest {
-  /**
-   * Runs the plan and makes sure the answer matches the expected result.
-   * If there was exception during the execution or the contents of the DataFrame does not
-   * match the expected result, an error message will be returned. Otherwise, a [[None]] will
-   * be returned.
-   * @param rdd the [[DataFrame]] to be executed
-   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
-   */
-  def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = {
-    val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty
-    def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
-      // Converts data to types that we can do equality comparison using Scala collections.
-      // For BigDecimal type, the Scala type has a better definition of equality test (similar to
-      // Java's java.math.BigDecimal.compareTo).
-      val converted: Seq[Row] = answer.map { s =>
-        Row.fromSeq(s.toSeq.map {
-          case d: java.math.BigDecimal => BigDecimal(d)
-          case o => o
-        })
-      }
-      if (!isSorted) converted.sortBy(_.toString) else converted
-    }
-    val sparkAnswer = try rdd.collect().toSeq catch {
-      case e: Exception =>
-        val errorMessage =
-          s"""
-            |Exception thrown while executing query:
-            |${rdd.queryExecution}
-            |== Exception ==
-            |$e
-            |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
-          """.stripMargin
-        return Some(errorMessage)
-    }
-
-    if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) {
-      val errorMessage =
-        s"""
-        |Results do not match for query:
-        |${rdd.logicalPlan}
-        |== Analyzed Plan ==
-        |${rdd.queryExecution.analyzed}
-        |== Physical Plan ==
-        |${rdd.queryExecution.executedPlan}
-        |== Results ==
-        |${sideBySide(
-          s"== Correct Answer - ${expectedAnswer.size} ==" +:
-            prepareAnswer(expectedAnswer).map(_.toString),
-          s"== Spark Answer - ${sparkAnswer.size} ==" +:
-            prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")}
-      """.stripMargin
-      return Some(errorMessage)
-    }
-
-    return None
-  }
-
-  def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = {
-    checkAnswer(rdd, expectedAnswer.toSeq) match {
-      case Some(errorMessage) => errorMessage
-      case None => null
-    }
-  }
-}
--- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans
-
-import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId}
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.catalyst.util._
-import org.scalatest.FunSuite
-
-/**
- * *** DUPLICATED FROM sql/catalyst/plans. ***
- *
- * It is hard to have maven allow one subproject depend on another subprojects test code.
- * So, we duplicate this code here.
- */
-class PlanTest extends FunSuite {
-
-  /**
-   * Since attribute references are given globally unique ids during analysis,
-   * we must normalize them to check if two different queries are identical.
-   */
-  protected def normalizeExprIds(plan: LogicalPlan) = {
-    plan transformAllExpressions {
-      case a: AttributeReference =>
-        AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0))
-      case a: Alias =>
-        Alias(a.child, a.name)(exprId = ExprId(0))
-    }
-  }
-
-  /** Fails the test if the two plans do not match */
-  protected def comparePlans(plan1: LogicalPlan, plan2: LogicalPlan) {
-    val normalized1 = normalizeExprIds(plan1)
-    val normalized2 = normalizeExprIds(plan2)
-    if (normalized1 != normalized2)
-      fail(
-        s"""
-          |== FAIL: Plans do not match ===
-          |${sideBySide(normalized1.treeString, normalized2.treeString).mkString("\n")}
-        """.stripMargin)
-  }
-}
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
@ -24,21 +24,6 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest}
 import org.apache.spark.storage.RDDBlockId

 class CachedTableSuite extends QueryTest {
-  /**
-   * Throws a test failed exception when the number of cached tables differs from the expected
-   * number.
-   */
-  def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = {
-    val planWithCaching = query.queryExecution.withCachedData
-    val cachedData = planWithCaching collect {
-      case cached: InMemoryRelation => cached
-    }
-
-    assert(
-      cachedData.size == numCachedTables,
-      s"Expected query to contain $numCachedTables, but it actually had ${cachedData.size}\n" +
-        planWithCaching)
-  }

  def rddIdOf(tableName: String): Int = {
    val executedPlan = table(tableName).queryExecution.executedPlan
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@ -97,34 +97,6 @@
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    <plugins>
-      <!--
-           This plugin forces the generation of jar containing streaming test classes,
-           so that the tests classes of external modules can use them. The two execution profiles
-           are necessary - first one for 'mvn package', second one for 'mvn test-compile'. Ideally,
-           'mvn compile' should not compile test classes and therefore should not need this.
-           However, an open Maven bug (http://jira.codehaus.org/browse/MNG-3559)
-           causes the compilation to fail if streaming test-jar is not generated. Hence, the
-           second execution profile for 'mvn test-compile'.
-      -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-jar-plugin</artifactId>
-        <executions>
-          <execution>
-            <goals>
-              <goal>test-jar</goal>
-            </goals>
-          </execution>
-          <execution>
-            <id>test-jar-on-test-compile</id>
-            <phase>test-compile</phase>
-            <goals>
-              <goal>test-jar</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>