From 2fd85174e9423673edec5ecb1f1c402ec33472fe Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Tue, 9 Mar 2021 21:28:35 +0900 Subject: [PATCH] [SPARK-34603][SQL] Support ADD ARCHIVE and LIST ARCHIVES command ### What changes were proposed in this pull request? This PR adds `ADD ARCHIVE` and `LIST ARCHIVES` commands to SQL and updates relevant documents. SPARK-33530 added `addArchive` and `listArchives` to `SparkContext` but it's not supported yet to add/list archives with SQL. ### Why are the changes needed? To complement features. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added new test and confirmed the generated HTML from the updated documents. Closes #31721 from sarutak/sql-archive. Authored-by: Kousuke Saruta Signed-off-by: HyukjinKwon --- docs/_data/menu-sql.yaml | 4 + ...ef-syntax-aux-resource-mgmt-add-archive.md | 53 +++++++++++ ...l-ref-syntax-aux-resource-mgmt-add-file.md | 3 +- ...ql-ref-syntax-aux-resource-mgmt-add-jar.md | 2 + ...f-syntax-aux-resource-mgmt-list-archive.md | 53 +++++++++++ ...-ref-syntax-aux-resource-mgmt-list-file.md | 5 +- ...l-ref-syntax-aux-resource-mgmt-list-jar.md | 6 +- docs/sql-ref-syntax-aux-resource-mgmt.md | 2 + .../spark/sql/execution/SparkSqlParser.scala | 10 +- .../sql/execution/command/resources.scala | 31 ++++++ .../sql/hive/execution/HiveQuerySuite.scala | 95 +++++++++++++++++++ 11 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 docs/sql-ref-syntax-aux-resource-mgmt-add-archive.md create mode 100644 docs/sql-ref-syntax-aux-resource-mgmt-list-archive.md diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index a9ea6fed92..5192422f8c 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -263,7 +263,11 @@ url: sql-ref-syntax-aux-resource-mgmt-add-file.html - text: ADD JAR url: sql-ref-syntax-aux-resource-mgmt-add-jar.html + - text: ADD ARCHIVE + url: sql-ref-syntax-aux-resource-mgmt-add-archive.html - text: LIST FILE url: sql-ref-syntax-aux-resource-mgmt-list-file.html - text: LIST JAR url: sql-ref-syntax-aux-resource-mgmt-list-jar.html + - text: LIST ARCHIVE + url: sql-ref-syntax-aux-resource-mgmt-list-archive.html diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-archive.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-archive.md new file mode 100644 index 0000000000..fa86acd8c5 --- /dev/null +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-archive.md @@ -0,0 +1,53 @@ +--- +layout: global +title: ADD ARCHIVE +displayTitle: ADD ARCHIVE +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +### Description + +`ADD ARCHIVE` can be used to add an archive file to the list of resources. The given archive file should be one of .zip, .tar, .tar.gz, .tgz and .jar. The added archive file can be listed using [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html). + +### Syntax + +```sql +ADD ARCHIVE file_name +``` + +### Parameters + +* **file_name** + + The name of the archive file to be added. It could be either on a local file system or a distributed file system. + +### Examples + +```sql +ADD ARCHIVE /tmp/test.tar.gz; +ADD ARCHIVE "/path/to/some.zip"; +ADD ARCHIVE '/some/other.tgz'; +ADD ARCHIVE "/path with space/abc.tar"; +``` + +### Related Statements + +* [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) +* [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) +* [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md index 9203293d0c..3154c432d5 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md @@ -49,5 +49,6 @@ ADD FILE "/path/to/some/directory"; * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) +* [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) - +* [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md index e5ac58ba81..3b342b02e5 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md @@ -63,3 +63,5 @@ ADD JAR "ivy://group:module:version?exclude=group:module&transitive=true" * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) +* [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-list-archive.md b/docs/sql-ref-syntax-aux-resource-mgmt-list-archive.md new file mode 100644 index 0000000000..ae5051ab11 --- /dev/null +++ b/docs/sql-ref-syntax-aux-resource-mgmt-list-archive.md @@ -0,0 +1,53 @@ +--- +layout: global +title: LIST ARCHIVE +displayTitle: LIST ARCHIVE +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +### Description + +`LIST ARCHIVE` lists the archives added by [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html). + +### Syntax + +```sql +LIST { ARCHIVE | ARCHIVES } file_name [ ... ] +``` + +### Examples + +```sql +ADD ARCHIVE /tmp/test.zip; +ADD ARCHIVE /tmp/test_2.tar.gz; +LIST ARCHIVE; +-- output for LIST ARCHIVE +file:/tmp/test.zip +file:/tmp/test_2.tar.gz + +LIST ARCHIVE /tmp/test.zip /some/random.tgz /another/random.tar; +-- output +file:/tmp/test.zip +``` + +### Related Statements + +* [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) +* [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) +* [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md b/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md index 9b9a7df7f6..d92f5fe162 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md @@ -26,7 +26,7 @@ license: | ### Syntax ```sql -LIST FILE +LIST { FILE | FILES } file_name [ ... ] ``` ### Examples @@ -48,5 +48,6 @@ file:/private/tmp/test * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) +* [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) - +* [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md index 04aa52c2ad..6c705aa8b1 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md @@ -26,7 +26,8 @@ license: | ### Syntax ```sql -LIST JAR +LIST { JAR | JARS } file_name [ ... ] + ``` ### Examples @@ -48,5 +49,6 @@ spark://192.168.1.112:62859/jars/test.jar * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) - +* [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt.md b/docs/sql-ref-syntax-aux-resource-mgmt.md index 50c12ef7c2..4a262824b7 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt.md @@ -21,5 +21,7 @@ license: | * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) + * [ADD ARCHIVE](sql-ref-syntax-aux-resource-mgmt-add-archive.html) * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) + * [LIST ARCHIVE](sql-ref-syntax-aux-resource-mgmt-list-archive.html) \ No newline at end of file diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index e9feba7c7f..0861706c06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -339,7 +339,8 @@ class SparkSqlAstBuilder extends AstBuilder { } /** - * Create a [[AddFileCommand]], [[AddJarCommand]], [[ListFilesCommand]] or [[ListJarsCommand]] + * Create a [[AddFileCommand]], [[AddJarCommand]], [[AddArchiveCommand]], + * [[ListFilesCommand]], [[ListJarsCommand]] or [[ListArchivesCommand]] * command depending on the requested operation on resources. * Expected format: * {{{ @@ -359,6 +360,7 @@ class SparkSqlAstBuilder extends AstBuilder { ctx.identifier.getText.toLowerCase(Locale.ROOT) match { case "file" => AddFileCommand(maybePaths) case "jar" => AddJarCommand(maybePaths) + case "archive" => AddArchiveCommand(maybePaths) case other => operationNotAllowed(s"ADD with resource type '$other'", ctx) } case SqlBaseParser.LIST => @@ -375,6 +377,12 @@ class SparkSqlAstBuilder extends AstBuilder { } else { ListJarsCommand() } + case "archives" | "archive" => + if (maybePaths.length > 0) { + ListArchivesCommand(maybePaths.split("\\s+")) + } else { + ListArchivesCommand() + } case other => operationNotAllowed(s"LIST with resource type '$other'", ctx) } case _ => operationNotAllowed(s"Other types of operation on resources", ctx) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala index fd115ac600..691837f38d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala @@ -47,6 +47,16 @@ case class AddFileCommand(path: String) extends RunnableCommand { } } +/** + * Adds an archive to the current session so it can be used. + */ +case class AddArchiveCommand(path: String) extends RunnableCommand { + override def run(sparkSession: SparkSession): Seq[Row] = { + sparkSession.sparkContext.addArchive(path) + Seq.empty[Row] + } +} + /** * Returns a list of file paths that are added to resources. * If file paths are provided, return the ones that are added to resources. @@ -94,3 +104,24 @@ case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends Runnab } } } + +/** + * Returns a list of archive paths that are added to resources. + * If archive paths are provided, return the ones that are added to resources. + */ +case class ListArchivesCommand(archives: Seq[String] = Seq.empty[String]) extends RunnableCommand { + override val output: Seq[Attribute] = { + AttributeReference("Results", StringType, nullable = false)() :: Nil + } + override def run(sparkSession: SparkSession): Seq[Row] = { + val archiveList = sparkSession.sparkContext.listArchives() + if (archives.nonEmpty) { + for { + archiveName <- archives.map(f => new Path(f).getName) + archivePath <- archiveList if archivePath.contains(archiveName) + } yield Row(archivePath) + } else { + archiveList.map(Row(_)) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 87c2541dc7..9968e8ddf5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.execution import java.io.File import java.net.URI +import java.nio.file.Files import java.sql.Timestamp import java.util.Locale @@ -857,6 +858,100 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd assert(sql(s"list file $testFile").count() == 1) } + test("ADD ARCHIVE/LIST ARCHIVES commands") { + withTempDir { dir => + val file1 = File.createTempFile("someprefix1", "somesuffix1", dir) + val file2 = File.createTempFile("someprefix2", "somesuffix2", dir) + + Files.write(file1.toPath, "file1".getBytes) + Files.write(file2.toPath, "file2".getBytes) + + val zipFile = new File(dir, "test.zip") + val jarFile = new File(dir, "test.jar") + TestUtils.createJar(Seq(file1), zipFile) + TestUtils.createJar(Seq(file2), jarFile) + + sql(s"ADD ARCHIVE ${zipFile.getAbsolutePath}#foo") + sql(s"ADD ARCHIVE ${jarFile.getAbsolutePath}#bar") + + val checkAddArchive = + sparkContext.parallelize( + Seq( + "foo", + s"foo/${file1.getName}", + "nonexistence", + "bar", + s"bar/${file2.getName}"), 1).map { name => + val file = new File(SparkFiles.get(name)) + val contents = + if (file.isFile) { + Some(String.join("", new String(Files.readAllBytes(file.toPath)))) + } else { + None + } + (name, file.canRead, contents) + }.collect() + + assert(checkAddArchive(0) === ("foo", true, None)) + assert(checkAddArchive(1) === (s"foo/${file1.getName}", true, Some("file1"))) + assert(checkAddArchive(2) === ("nonexistence", false, None)) + assert(checkAddArchive(3) === ("bar", true, None)) + assert(checkAddArchive(4) === (s"bar/${file2.getName}", true, Some("file2"))) + assert(sql("list archives"). + filter(_.getString(0).contains(s"${zipFile.getAbsolutePath}")).count() > 0) + assert(sql("list archive"). + filter(_.getString(0).contains(s"${jarFile.getAbsolutePath}")).count() > 0) + assert(sql(s"list archive ${zipFile.getAbsolutePath}").count() === 1) + assert(sql(s"list archives ${zipFile.getAbsolutePath} nonexistence").count() === 1) + assert(sql(s"list archives ${zipFile.getAbsolutePath} " + + s"${jarFile.getAbsolutePath}").count === 2) + } + } + + test("ADD ARCHIVE/List ARCHIVES commands - unsupported archive formats") { + withTempDir { dir => + val file1 = File.createTempFile("someprefix1", "somesuffix1", dir) + val file2 = File.createTempFile("someprefix2", "somesuffix2", dir) + + Files.write(file1.toPath, "file1".getBytes) + Files.write(file2.toPath, "file2".getBytes) + + // Emulate unsupported archive formats with .bz2 and .xz suffix. + val bz2File = new File(dir, "test.bz2") + val xzFile = new File(dir, "test.xz") + TestUtils.createJar(Seq(file1), bz2File) + TestUtils.createJar(Seq(file2), xzFile) + + sql(s"ADD ARCHIVE ${bz2File.getAbsolutePath}#foo") + sql(s"ADD ARCHIVE ${xzFile.getAbsolutePath}#bar") + + val checkAddArchive = + sparkContext.parallelize( + Seq( + "foo", + "bar"), 1).map { name => + val file = new File(SparkFiles.get(name)) + val contents = + if (file.isFile) { + Some(Files.readAllBytes(file.toPath).toSeq) + } else { + None + } + (name, file.canRead, contents) + }.collect() + + assert(checkAddArchive(0) === ("foo", true, Some(Files.readAllBytes(bz2File.toPath).toSeq))) + assert(checkAddArchive(1) === ("bar", true, Some(Files.readAllBytes(xzFile.toPath).toSeq))) + assert(sql("list archives"). + filter(_.getString(0).contains(s"${bz2File.getAbsolutePath}")).count() > 0) + assert(sql("list archive"). + filter(_.getString(0).contains(s"${xzFile.getAbsolutePath}")).count() > 0) + assert(sql(s"list archive ${bz2File.getAbsolutePath}").count() === 1) + assert(sql(s"list archives ${bz2File.getAbsolutePath} " + + s"${xzFile.getAbsolutePath}").count === 2) + } + } + createQueryTest("dynamic_partition", """ |DROP TABLE IF EXISTS dynamic_part_table;