[SPARK-5808] [build] Package pyspark files in sbt assembly.

This turned out to be more complicated than I wanted because the
layout of python/ doesn't really follow the usual Maven conventions.
So some extra code is needed to copy just the right things.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #5461 from vanzin/SPARK-5808 and squashes the following commits:

7153dac [Marcelo Vanzin] Only try to create resource dir if it doesn't already exist.
ee90e84 [Marcelo Vanzin] [SPARK-5808] [build] Package pyspark files in sbt assembly.
This commit is contained in:
Marcelo Vanzin 2015-04-14 13:41:38 -07:00 committed by Andrew Or
parent 6adb8bcbf0
commit 65774370a1

View file

@ -15,7 +15,7 @@
* limitations under the License.
*/
import java.io.File
import java.io._
import scala.util.Properties
import scala.collection.JavaConversions._
@ -166,6 +166,9 @@ object SparkBuild extends PomBuild {
/* Enable Assembly for all assembly projects */
assemblyProjects.foreach(enable(Assembly.settings))
/* Package pyspark artifacts in the main assembly. */
enable(PySparkAssembly.settings)(assembly)
/* Enable unidoc only for the root spark project */
enable(Unidoc.settings)(spark)
@ -316,6 +319,7 @@ object Hive {
}
object Assembly {
import sbtassembly.AssemblyUtils._
import sbtassembly.Plugin._
import AssemblyKeys._
@ -347,6 +351,60 @@ object Assembly {
)
}
/**
 * Build settings that package the PySpark sources into the assembly.
 *
 * The py4j source zip is added as an unmanaged jar, and every `.py` file under
 * `python/pyspark` is copied (preserving the directory layout) into a managed resource
 * directory by a resource generator.
 */
object PySparkAssembly {
  import sbtassembly.Plugin._
  import AssemblyKeys._

  lazy val settings = Seq(
    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
    // Use a resource generator to copy all .py files from python/pyspark into a managed directory
    // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
    // list since that will copy unneeded / unwanted files.
    resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
      val dst = new File(outDir, "pyspark")
      if (!dst.isDirectory()) {
        require(dst.mkdirs())
      }
      val src = new File(BuildCommons.sparkHome, "python/pyspark")
      copy(src, dst)
    }
  )

  /**
   * Recursively copies every `.py` file found under `src` into `dst`, mirroring the
   * sub-directory structure. Files with any other extension are skipped.
   *
   * @param src directory to scan
   * @param dst existing directory that receives the copies
   * @return the files written under `dst`
   * @throws IOException if `src` cannot be listed or a sub-directory cannot be created
   */
  private def copy(src: File, dst: File): Seq[File] = {
    // File.listFiles() returns null (not an empty array) when the path is not a directory or
    // cannot be read; fail with a descriptive error instead of a NullPointerException.
    val entries = Option(src.listFiles()).getOrElse {
      throw new IOException(s"Cannot list contents of $src (missing, not a directory, or unreadable).")
    }
    entries.toSeq.flatMap { f =>
      val child = new File(dst, f.getName())
      if (f.isDirectory()) {
        // mkdir() legitimately returns false when the directory is left over from a previous
        // build, so only treat it as an error if the directory still does not exist.
        if (!child.isDirectory() && !child.mkdir()) {
          throw new IOException(s"Failed to create directory $child.")
        }
        copy(f, child)
      } else if (f.getName().endsWith(".py")) {
        copyFile(f, child)
        Seq(child)
      } else {
        Seq.empty[File]
      }
    }
  }

  /** Copies the bytes of `src` into `dst`, closing both streams even if the copy fails. */
  private def copyFile(src: File, dst: File): Unit = {
    val in = new FileInputStream(src)
    try {
      val out = new FileOutputStream(dst)
      // Nested try/finally so that a failure while closing one stream cannot leak the other.
      try {
        val buffer = new Array[Byte](1024)
        var read = in.read(buffer)
        while (read >= 0) {
          out.write(buffer, 0, read)
          read = in.read(buffer)
        }
      } finally {
        out.close()
      }
    } finally {
      in.close()
    }
  }
}
object Unidoc {
import BuildCommons._