[SPARK-33696][BUILD][SQL] Upgrade built-in Hive to 2.3.8
### What changes were proposed in this pull request? Hive 2.3.8 changes: HIVE-19662: Upgrade Avro to 1.8.2 HIVE-24324: Remove deprecated API usage from Avro HIVE-23980: Shade Guava from hive-exec in Hive 2.3 HIVE-24436: Fix Avro NULL_DEFAULT_VALUE compatibility issue HIVE-24512: Exclude calcite in packaging. HIVE-22708: Fix for HttpTransport to replace String.equals HIVE-24551: Hive should include transitive dependencies from calcite after shading it HIVE-24553: Exclude calcite from test-jar dependency of hive-exec ### Why are the changes needed? Upgrade Avro and Parquet to the latest versions. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests, plus an additional test that tries upgrading Parquet to 1.11.1 and Avro to 1.10.1: https://github.com/apache/spark/pull/30517 Closes #30657 from wangyum/SPARK-33696. Authored-by: Yuming Wang <yumwang@ebay.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
b5bdbf2ebc
commit
c87b0085c9
|
@ -81,21 +81,21 @@ hadoop-yarn-client/2.7.4//hadoop-yarn-client-2.7.4.jar
|
|||
hadoop-yarn-common/2.7.4//hadoop-yarn-common-2.7.4.jar
|
||||
hadoop-yarn-server-common/2.7.4//hadoop-yarn-server-common-2.7.4.jar
|
||||
hadoop-yarn-server-web-proxy/2.7.4//hadoop-yarn-server-web-proxy-2.7.4.jar
|
||||
hive-beeline/2.3.7//hive-beeline-2.3.7.jar
|
||||
hive-cli/2.3.7//hive-cli-2.3.7.jar
|
||||
hive-common/2.3.7//hive-common-2.3.7.jar
|
||||
hive-exec/2.3.7/core/hive-exec-2.3.7-core.jar
|
||||
hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar
|
||||
hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar
|
||||
hive-metastore/2.3.7//hive-metastore-2.3.7.jar
|
||||
hive-serde/2.3.7//hive-serde-2.3.7.jar
|
||||
hive-beeline/2.3.8//hive-beeline-2.3.8.jar
|
||||
hive-cli/2.3.8//hive-cli-2.3.8.jar
|
||||
hive-common/2.3.8//hive-common-2.3.8.jar
|
||||
hive-exec/2.3.8/core/hive-exec-2.3.8-core.jar
|
||||
hive-jdbc/2.3.8//hive-jdbc-2.3.8.jar
|
||||
hive-llap-common/2.3.8//hive-llap-common-2.3.8.jar
|
||||
hive-metastore/2.3.8//hive-metastore-2.3.8.jar
|
||||
hive-serde/2.3.8//hive-serde-2.3.8.jar
|
||||
hive-service-rpc/3.1.2//hive-service-rpc-3.1.2.jar
|
||||
hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar
|
||||
hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar
|
||||
hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar
|
||||
hive-shims/2.3.7//hive-shims-2.3.7.jar
|
||||
hive-shims-0.23/2.3.8//hive-shims-0.23-2.3.8.jar
|
||||
hive-shims-common/2.3.8//hive-shims-common-2.3.8.jar
|
||||
hive-shims-scheduler/2.3.8//hive-shims-scheduler-2.3.8.jar
|
||||
hive-shims/2.3.8//hive-shims-2.3.8.jar
|
||||
hive-storage-api/2.7.2//hive-storage-api-2.7.2.jar
|
||||
hive-vector-code-gen/2.3.7//hive-vector-code-gen-2.3.7.jar
|
||||
hive-vector-code-gen/2.3.8//hive-vector-code-gen-2.3.8.jar
|
||||
hk2-api/2.6.1//hk2-api-2.6.1.jar
|
||||
hk2-locator/2.6.1//hk2-locator-2.6.1.jar
|
||||
hk2-utils/2.6.1//hk2-utils-2.6.1.jar
|
||||
|
|
|
@ -58,21 +58,21 @@ gson/2.2.4//gson-2.2.4.jar
|
|||
guava/14.0.1//guava-14.0.1.jar
|
||||
hadoop-client-api/3.2.2//hadoop-client-api-3.2.2.jar
|
||||
hadoop-client-runtime/3.2.2//hadoop-client-runtime-3.2.2.jar
|
||||
hive-beeline/2.3.7//hive-beeline-2.3.7.jar
|
||||
hive-cli/2.3.7//hive-cli-2.3.7.jar
|
||||
hive-common/2.3.7//hive-common-2.3.7.jar
|
||||
hive-exec/2.3.7/core/hive-exec-2.3.7-core.jar
|
||||
hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar
|
||||
hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar
|
||||
hive-metastore/2.3.7//hive-metastore-2.3.7.jar
|
||||
hive-serde/2.3.7//hive-serde-2.3.7.jar
|
||||
hive-beeline/2.3.8//hive-beeline-2.3.8.jar
|
||||
hive-cli/2.3.8//hive-cli-2.3.8.jar
|
||||
hive-common/2.3.8//hive-common-2.3.8.jar
|
||||
hive-exec/2.3.8/core/hive-exec-2.3.8-core.jar
|
||||
hive-jdbc/2.3.8//hive-jdbc-2.3.8.jar
|
||||
hive-llap-common/2.3.8//hive-llap-common-2.3.8.jar
|
||||
hive-metastore/2.3.8//hive-metastore-2.3.8.jar
|
||||
hive-serde/2.3.8//hive-serde-2.3.8.jar
|
||||
hive-service-rpc/3.1.2//hive-service-rpc-3.1.2.jar
|
||||
hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar
|
||||
hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar
|
||||
hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar
|
||||
hive-shims/2.3.7//hive-shims-2.3.7.jar
|
||||
hive-shims-0.23/2.3.8//hive-shims-0.23-2.3.8.jar
|
||||
hive-shims-common/2.3.8//hive-shims-common-2.3.8.jar
|
||||
hive-shims-scheduler/2.3.8//hive-shims-scheduler-2.3.8.jar
|
||||
hive-shims/2.3.8//hive-shims-2.3.8.jar
|
||||
hive-storage-api/2.7.2//hive-storage-api-2.7.2.jar
|
||||
hive-vector-code-gen/2.3.7//hive-vector-code-gen-2.3.7.jar
|
||||
hive-vector-code-gen/2.3.8//hive-vector-code-gen-2.3.8.jar
|
||||
hk2-api/2.6.1//hk2-api-2.6.1.jar
|
||||
hk2-locator/2.6.1//hk2-locator-2.6.1.jar
|
||||
hk2-utils/2.6.1//hk2-utils-2.6.1.jar
|
||||
|
|
|
@ -83,9 +83,9 @@ Example:
|
|||
|
||||
To enable Hive integration for Spark SQL along with its JDBC server and CLI,
|
||||
add the `-Phive` and `-Phive-thriftserver` profiles to your existing build options.
|
||||
By default Spark will build with Hive 2.3.7.
|
||||
By default Spark will build with Hive 2.3.8.
|
||||
|
||||
# With Hive 2.3.7 support
|
||||
# With Hive 2.3.8 support
|
||||
./build/mvn -Pyarn -Phive -Phive-thriftserver -DskipTests clean package
|
||||
|
||||
## Packaging without Hadoop Dependencies for YARN
|
||||
|
|
|
@ -127,10 +127,10 @@ The following options can be used to configure the version of Hive that is used
|
|||
<tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>
|
||||
<tr>
|
||||
<td><code>spark.sql.hive.metastore.version</code></td>
|
||||
<td><code>2.3.7</code></td>
|
||||
<td><code>2.3.8</code></td>
|
||||
<td>
|
||||
Version of the Hive metastore. Available
|
||||
options are <code>0.12.0</code> through <code>2.3.7</code> and <code>3.0.0</code> through <code>3.1.2</code>.
|
||||
options are <code>0.12.0</code> through <code>2.3.8</code> and <code>3.0.0</code> through <code>3.1.2</code>.
|
||||
</td>
|
||||
<td>1.4.0</td>
|
||||
</tr>
|
||||
|
@ -142,9 +142,9 @@ The following options can be used to configure the version of Hive that is used
|
|||
property can be one of four options:
|
||||
<ol>
|
||||
<li><code>builtin</code></li>
|
||||
Use Hive 2.3.7, which is bundled with the Spark assembly when <code>-Phive</code> is
|
||||
Use Hive 2.3.8, which is bundled with the Spark assembly when <code>-Phive</code> is
|
||||
enabled. When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
|
||||
either <code>2.3.7</code> or not defined.
|
||||
either <code>2.3.8</code> or not defined.
|
||||
<li><code>maven</code></li>
|
||||
Use Hive jars of specified version downloaded from Maven repositories. This configuration
|
||||
is not generally recommended for production deployments.
|
||||
|
|
|
@ -863,7 +863,7 @@ Python UDF registration is unchanged.
|
|||
Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs.
|
||||
Currently, Hive SerDes and UDFs are based on built-in Hive,
|
||||
and Spark SQL can be connected to different versions of Hive Metastore
|
||||
(from 0.12.0 to 2.3.7 and 3.0.0 to 3.1.2. Also see [Interacting with Different Versions of Hive Metastore](sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)).
|
||||
(from 0.12.0 to 2.3.8 and 3.0.0 to 3.1.2. Also see [Interacting with Different Versions of Hive Metastore](sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)).
|
||||
|
||||
#### Deploying in Existing Hive Warehouses
|
||||
{:.no_toc}
|
||||
|
|
20
pom.xml
20
pom.xml
|
@ -128,8 +128,8 @@
|
|||
<hive.group>org.apache.hive</hive.group>
|
||||
<hive.classifier>core</hive.classifier>
|
||||
<!-- Version used in Maven Hive dependency -->
|
||||
<hive.version>2.3.7</hive.version>
|
||||
<hive23.version>2.3.7</hive23.version>
|
||||
<hive.version>2.3.8</hive.version>
|
||||
<hive23.version>2.3.8</hive23.version>
|
||||
<!-- Version used for internal directory structure -->
|
||||
<hive.version.short>2.3</hive.version.short>
|
||||
<!-- note that this should be compatible with Kafka brokers version 0.10 and up -->
|
||||
|
@ -1891,6 +1891,22 @@
|
|||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.hydromatic</groupId>
|
||||
<artifactId>eigenbase-properties</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.codehaus.janino</groupId>
|
||||
<artifactId>commons-compiler</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.codehaus.janino</groupId>
|
||||
<artifactId>janino</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.pentaho</groupId>
|
||||
<artifactId>pentaho-aggdesigner-algorithm</artifactId>
|
||||
</exclusion>
|
||||
<!-- End of Hive 2.3 exclusion -->
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
|
|
@ -3724,20 +3724,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
|
|||
|
||||
test("SPARK-33084: Add jar support Ivy URI in SQL") {
|
||||
val sc = spark.sparkContext
|
||||
val hiveVersion = "2.3.8"
|
||||
// default transitive=false, only download specified jar
|
||||
sql("ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:2.3.7")
|
||||
sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion")
|
||||
assert(sc.listJars()
|
||||
.exists(_.contains("org.apache.hive.hcatalog_hive-hcatalog-core-2.3.7.jar")))
|
||||
.exists(_.contains(s"org.apache.hive.hcatalog_hive-hcatalog-core-$hiveVersion.jar")))
|
||||
|
||||
// test download ivy URL jar return multiple jars
|
||||
sql("ADD JAR ivy://org.scala-js:scalajs-test-interface_2.12:1.2.0?transitive=true")
|
||||
assert(sc.listJars().exists(_.contains("scalajs-library_2.12")))
|
||||
assert(sc.listJars().exists(_.contains("scalajs-test-interface_2.12")))
|
||||
|
||||
sql("ADD JAR ivy://org.apache.hive:hive-contrib:2.3.7" +
|
||||
sql(s"ADD JAR ivy://org.apache.hive:hive-contrib:$hiveVersion" +
|
||||
"?exclude=org.pentaho:pentaho-aggdesigner-algorithm&transitive=true")
|
||||
assert(sc.listJars().exists(_.contains("org.apache.hive_hive-contrib-2.3.7.jar")))
|
||||
assert(sc.listJars().exists(_.contains("org.apache.hive_hive-exec-2.3.7.jar")))
|
||||
assert(sc.listJars().exists(_.contains(s"org.apache.hive_hive-contrib-$hiveVersion.jar")))
|
||||
assert(sc.listJars().exists(_.contains(s"org.apache.hive_hive-exec-$hiveVersion.jar")))
|
||||
assert(!sc.listJars().exists(_.contains("org.pentaho.pentaho_aggdesigner-algorithm")))
|
||||
}
|
||||
|
||||
|
|
|
@ -546,7 +546,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftServer2Test {
|
|||
conf += resultSet.getString(1) -> resultSet.getString(2)
|
||||
}
|
||||
|
||||
assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.7"))
|
||||
assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.8"))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -559,7 +559,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftServer2Test {
|
|||
conf += resultSet.getString(1) -> resultSet.getString(2)
|
||||
}
|
||||
|
||||
assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.7"))
|
||||
assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.8"))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ private[spark] object HiveUtils extends Logging {
|
|||
|
||||
val HIVE_METASTORE_VERSION = buildStaticConf("spark.sql.hive.metastore.version")
|
||||
.doc("Version of the Hive metastore. Available options are " +
|
||||
"<code>0.12.0</code> through <code>2.3.7</code> and " +
|
||||
"<code>0.12.0</code> through <code>2.3.8</code> and " +
|
||||
"<code>3.0.0</code> through <code>3.1.2</code>.")
|
||||
.version("1.4.0")
|
||||
.stringConf
|
||||
|
|
|
@ -98,8 +98,8 @@ private[hive] object IsolatedClientLoader extends Logging {
|
|||
case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
|
||||
case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
|
||||
case "2.2" | "2.2.0" => hive.v2_2
|
||||
case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" | "2.3.7" =>
|
||||
hive.v2_3
|
||||
case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" | "2.3.7" |
|
||||
"2.3.8" => hive.v2_3
|
||||
case "3.0" | "3.0.0" => hive.v3_0
|
||||
case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
|
||||
case version =>
|
||||
|
|
|
@ -100,11 +100,13 @@ package object client {
|
|||
"org.apache.curator:*",
|
||||
"org.pentaho:pentaho-aggdesigner-algorithm"))
|
||||
|
||||
// Since HIVE-14496, Hive materialized views need calcite-core.
|
||||
// Since HIVE-23980, calcite-core is included in the Hive package jar.
|
||||
// For spark, only VersionsSuite currently creates a hive materialized view for testing.
|
||||
case object v2_3 extends HiveVersion("2.3.7",
|
||||
exclusions = Seq("org.apache.calcite:calcite-druid",
|
||||
case object v2_3 extends HiveVersion("2.3.8",
|
||||
exclusions = Seq("org.apache.calcite:calcite-core",
|
||||
"org.apache.calcite:calcite-druid",
|
||||
"org.apache.calcite.avatica:avatica",
|
||||
"com.fasterxml.jackson.core:*",
|
||||
"org.apache.curator:*",
|
||||
"org.pentaho:pentaho-aggdesigner-algorithm"))
|
||||
|
||||
|
@ -114,7 +116,6 @@ package object client {
|
|||
extraDeps = Seq("org.apache.logging.log4j:log4j-api:2.10.0",
|
||||
"org.apache.derby:derby:10.14.1.0"),
|
||||
exclusions = Seq("org.apache.calcite:calcite-druid",
|
||||
"org.apache.calcite.avatica:avatica",
|
||||
"org.apache.curator:*",
|
||||
"org.pentaho:pentaho-aggdesigner-algorithm"))
|
||||
|
||||
|
@ -124,7 +125,6 @@ package object client {
|
|||
extraDeps = Seq("org.apache.logging.log4j:log4j-api:2.10.0",
|
||||
"org.apache.derby:derby:10.14.1.0"),
|
||||
exclusions = Seq("org.apache.calcite:calcite-druid",
|
||||
"org.apache.calcite.avatica:avatica",
|
||||
"org.apache.curator:*",
|
||||
"org.pentaho:pentaho-aggdesigner-algorithm"))
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils {
|
|||
.map(new File(_)).getOrElse(Utils.createTempDir(namePrefix = "test-spark"))
|
||||
private val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
|
||||
val hiveVersion = if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_9)) {
|
||||
"2.3.7"
|
||||
"2.3.8"
|
||||
} else {
|
||||
"1.2.1"
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.Cast
|
|||
import org.apache.spark.sql.catalyst.parser.ParseException
|
||||
import org.apache.spark.sql.catalyst.plans.logical.Project
|
||||
import org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec
|
||||
import org.apache.spark.sql.hive.HiveUtils.{builtinHiveVersion => hiveVersion}
|
||||
import org.apache.spark.sql.hive.test.{HiveTestJars, TestHive}
|
||||
import org.apache.spark.sql.hive.test.TestHive._
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
@ -1223,17 +1224,17 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
|
|||
test("SPARK-33084: Add jar support Ivy URI in SQL") {
|
||||
val testData = TestHive.getHiveFile("data/files/sample.json").toURI
|
||||
withTable("t") {
|
||||
sql("ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:2.3.7")
|
||||
sql(s"ADD JAR ivy://org.apache.hive.hcatalog:hive-hcatalog-core:$hiveVersion")
|
||||
sql(
|
||||
"""CREATE TABLE t(a string, b string)
|
||||
|ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'""".stripMargin)
|
||||
sql(s"""LOAD DATA LOCAL INPATH "$testData" INTO TABLE t""")
|
||||
sql("SELECT * FROM src JOIN t on src.key = t.a")
|
||||
assert(sql("LIST JARS").filter(_.getString(0).contains(
|
||||
"org.apache.hive.hcatalog_hive-hcatalog-core-2.3.7.jar")).count() > 0)
|
||||
s"org.apache.hive.hcatalog_hive-hcatalog-core-$hiveVersion.jar")).count() > 0)
|
||||
assert(sql("LIST JAR").
|
||||
filter(_.getString(0).contains(
|
||||
"org.apache.hive.hcatalog_hive-hcatalog-core-2.3.7.jar")).count() > 0)
|
||||
s"org.apache.hive.hcatalog_hive-hcatalog-core-$hiveVersion.jar")).count() > 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue