[SPARK-34789][TEST] Introduce Jetty based construct for integration tests where HTTP server is used

### What changes were proposed in this pull request?

Introducing a new test construct:
```
  withHttpServer() { baseURL =>
    ...
  }
```
Which starts and stops a Jetty server to serve files via HTTP.

Moreover this PR uses this new construct in the test `Run SparkRemoteFileTest using a remote data file`.

### Why are the changes needed?

Before this PR, GitHub URLs were used, like "https://raw.githubusercontent.com/apache/spark/master/data/mllib/pagerank_data.txt".
This connects two Spark versions in an unhealthy way: it ties the "master" branch, which is a moving part, to the committed test code, which is non-moving (as it might even be released).
So a test running for an earlier version of Spark expects something (filename, content, path) from a later release; what is worse, when the moving version is changed, the earlier test will break.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing unit test.

Closes #31935 from attilapiros/SPARK-34789.

Authored-by: “attilapiros” <piros.attila.zsolt@gmail.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
“attilapiros” 2021-04-14 21:22:52 -07:00 committed by Dongjoon Hyun
parent 7ff9d2e3ee
commit 8a3815f722
2 changed files with 30 additions and 6 deletions

View file

@ -18,7 +18,7 @@
package org.apache.spark
import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream}
import java.net.{HttpURLConnection, URI, URL}
import java.net.{HttpURLConnection, InetSocketAddress, URI, URL}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files => JavaFiles, Paths}
import java.nio.file.attribute.PosixFilePermission.{OWNER_EXECUTE, OWNER_READ, OWNER_WRITE}
@ -41,6 +41,11 @@ import scala.util.Try
import com.google.common.io.{ByteStreams, Files}
import org.apache.commons.lang3.StringUtils
import org.apache.log4j.PropertyConfigurator
import org.eclipse.jetty.server.Handler
import org.eclipse.jetty.server.Server
import org.eclipse.jetty.server.handler.DefaultHandler
import org.eclipse.jetty.server.handler.HandlerList
import org.eclipse.jetty.server.handler.ResourceHandler
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, render}
@ -364,6 +369,22 @@ private[spark] object TestUtils {
}
}
/**
 * Runs `body` against a freshly started embedded Jetty HTTP server that serves
 * the files under `resBaseDir`, and guarantees the server is stopped afterwards.
 *
 * @param resBaseDir directory whose contents are served over HTTP (defaults to ".")
 * @param body test code receiving the server's base URL
 */
def withHttpServer(resBaseDir: String = ".")(body: URL => Unit): Unit = {
  val fileHandler = new ResourceHandler()
  fileHandler.setResourceBase(resBaseDir)
  // Serve files first; fall back to Jetty's DefaultHandler for anything else.
  val handlerChain = new HandlerList()
  handlerChain.setHandlers(Array[Handler](fileHandler, new DefaultHandler()))
  // Binding to port 0 makes the OS assign an arbitrary free port.
  val httpServer = new Server(new InetSocketAddress(Utils.localCanonicalHostName, 0))
  httpServer.setHandler(handlerChain)
  httpServer.start()
  try {
    body(httpServer.getURI.toURL)
  } finally {
    // Always tear the server down, even if `body` throws.
    httpServer.stop()
  }
}
/**
* Wait until at least `numExecutors` executors are up, or throw `TimeoutException` if the waiting
* time elapsed before `numExecutors` executors up. Exposed for testing.

View file

@ -18,6 +18,7 @@ package org.apache.spark.deploy.k8s.integrationtest
import io.fabric8.kubernetes.api.model.Pod
import org.apache.spark.TestUtils
import org.apache.spark.launcher.SparkLauncher
private[spark] trait BasicTestsSuite { k8sSuite: KubernetesSuite =>
@ -99,9 +100,12 @@ private[spark] trait BasicTestsSuite { k8sSuite: KubernetesSuite =>
}
test("Run SparkRemoteFileTest using a remote data file", k8sTestTag) {
// NOTE(review): the two lines below are the REMOVED side of the diff (the
// rendering lost its "-" markers) — the old code pointed spark.files at a
// hard-coded GitHub raw URL on the moving "master" branch.
sparkAppConf
.set("spark.files", REMOTE_PAGE_RANK_DATA_FILE)
runSparkRemoteCheckAndVerifyCompletion(appArgs = Array(REMOTE_PAGE_RANK_FILE_NAME))
// The lines below are the ADDED side: serve the data file from the local
// Spark checkout over an ephemeral Jetty server instead of the network.
// spark.test.home must point at the checkout root so the relative
// REMOTE_PAGE_RANK_DATA_FILE path resolves under the served base dir.
assert(sys.props.contains("spark.test.home"), "spark.test.home is not set!")
TestUtils.withHttpServer(sys.props("spark.test.home")) { baseURL =>
sparkAppConf
.set("spark.files", baseURL.toString + REMOTE_PAGE_RANK_DATA_FILE)
runSparkRemoteCheckAndVerifyCompletion(appArgs = Array(REMOTE_PAGE_RANK_FILE_NAME))
}
}
}
@ -110,7 +114,6 @@ private[spark] object BasicTestsSuite {
val CONTAINER_LOCAL_FILE_DOWNLOAD_PATH = "/var/spark-data/spark-files"
val CONTAINER_LOCAL_DOWNLOADED_PAGE_RANK_DATA_FILE =
s"$CONTAINER_LOCAL_FILE_DOWNLOAD_PATH/pagerank_data.txt"
val REMOTE_PAGE_RANK_DATA_FILE =
"https://raw.githubusercontent.com/apache/spark/master/data/mllib/pagerank_data.txt"
val REMOTE_PAGE_RANK_DATA_FILE = "data/mllib/pagerank_data.txt"
val REMOTE_PAGE_RANK_FILE_NAME = "pagerank_data.txt"
}