[SPARK-7485] [BUILD] Remove pyspark files from assembly.
The sbt part of the build is hacky: it tricks sbt into generating the zip by registering a resource generator, but that generator returns an empty list of generated files, so nothing is actually added to the assembly. Author: Marcelo Vanzin <vanzin@cloudera.com>. Closes #6022 from vanzin/SPARK-7485 and squashes the following commits: 22c1e04 [Marcelo Vanzin] Remove unneeded code; 4893622 [Marcelo Vanzin] [SPARK-7485] [build] Remove pyspark files from assembly.
This commit is contained in:
parent
9847875266
commit
82e890fb19
47
core/pom.xml
47
core/pom.xml
|
@ -381,35 +381,6 @@
|
|||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<plugins>
|
||||
<!-- Unzip py4j so we can include its files in the jar -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-antrun-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-resources</phase>
|
||||
<goals>
|
||||
<goal>run</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<target>
|
||||
<unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
|
||||
</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-clean-plugin</artifactId>
|
||||
<configuration>
|
||||
<filesets>
|
||||
<fileset>
|
||||
<directory>${basedir}/../python/build</directory>
|
||||
</fileset>
|
||||
</filesets>
|
||||
<verbose>true</verbose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
|
@ -438,24 +409,6 @@
|
|||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
</resource>
|
||||
<resource>
|
||||
<directory>../python</directory>
|
||||
<includes>
|
||||
<include>pyspark/*.py</include>
|
||||
</includes>
|
||||
</resource>
|
||||
<resource>
|
||||
<directory>../python/build</directory>
|
||||
<includes>
|
||||
<include>py4j/*.py</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
</build>
|
||||
|
||||
<profiles>
|
||||
|
|
|
@ -141,16 +141,5 @@
|
|||
<build>
|
||||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>../python</directory>
|
||||
<includes>
|
||||
<include>pyspark/mllib/*.py</include>
|
||||
<include>pyspark/mllib/stat/*.py</include>
|
||||
<include>pyspark/ml/*.py</include>
|
||||
<include>pyspark/ml/param/*.py</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -168,7 +168,7 @@ object SparkBuild extends PomBuild {
|
|||
/* Enable Assembly for all assembly projects */
|
||||
assemblyProjects.foreach(enable(Assembly.settings))
|
||||
|
||||
/* Package pyspark artifacts in the main assembly. */
|
||||
/* Package pyspark artifacts in a separate zip file for YARN. */
|
||||
enable(PySparkAssembly.settings)(assembly)
|
||||
|
||||
/* Enable unidoc only for the root spark project */
|
||||
|
@ -373,22 +373,15 @@ object PySparkAssembly {
|
|||
import java.util.zip.{ZipOutputStream, ZipEntry}
|
||||
|
||||
lazy val settings = Seq(
|
||||
unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
|
||||
// Use a resource generator to copy all .py files from python/pyspark into a managed directory
|
||||
// to be included in the assembly. We can't just add "python/" to the assembly's resource dir
|
||||
// list since that will copy unneeded / unwanted files.
|
||||
resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
|
||||
val src = new File(BuildCommons.sparkHome, "python/pyspark")
|
||||
|
||||
val zipFile = new File(BuildCommons.sparkHome , "python/lib/pyspark.zip")
|
||||
zipFile.delete()
|
||||
zipRecursive(src, zipFile)
|
||||
|
||||
val dst = new File(outDir, "pyspark")
|
||||
if (!dst.isDirectory()) {
|
||||
require(dst.mkdirs())
|
||||
}
|
||||
copy(src, dst)
|
||||
Seq[File]()
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -416,42 +409,11 @@ object PySparkAssembly {
|
|||
output.write(buf, 0, n)
|
||||
}
|
||||
}
|
||||
output.closeEntry()
|
||||
in.close()
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Recursively copies the `.py` files found under `src` into `dst`, mirroring the
 * sub-directory layout of `src`.
 *
 * Sub-directories are created under `dst` as they are visited; files whose name does not
 * end in ".py" are skipped.
 *
 * NOTE(review): `java.io.File.listFiles()` returns null when `src` is not a directory or
 * cannot be read, which would make this method throw — confirm callers always pass a
 * readable directory.
 *
 * @param src directory whose entries are enumerated
 * @param dst directory that receives the copies; presumably must already exist, since only
 *            its sub-directories are created here
 * @return the destination files that were written, flattened across all sub-directories
 */
private def copy(src: File, dst: File): Seq[File] = {
|
||||
src.listFiles().flatMap { f =>
|
||||
val child = new File(dst, f.getName())
|
||||
if (f.isDirectory()) {
|
||||
// The boolean result of mkdir() is ignored, so an already-existing directory is not an error.
child.mkdir()
|
||||
copy(f, child)
|
||||
} else if (f.getName().endsWith(".py")) {
|
||||
// Held as Options so the finally block closes only the streams that were actually opened.
var in: Option[FileInputStream] = None
|
||||
var out: Option[FileOutputStream] = None
|
||||
try {
|
||||
in = Some(new FileInputStream(f))
|
||||
out = Some(new FileOutputStream(child))
|
||||
|
||||
val bytes = new Array[Byte](1024)
|
||||
var read = 0
|
||||
// read() yields a negative value at end of stream, which terminates the loop;
// only positive reads are written out.
while (read >= 0) {
|
||||
read = in.get.read(bytes)
|
||||
if (read > 0) {
|
||||
out.get.write(bytes, 0, read)
|
||||
}
|
||||
}
|
||||
|
||||
// Report the copied file; flatMap flattens this Option into the result sequence.
Some(child)
|
||||
} finally {
|
||||
in.foreach(_.close())
|
||||
out.foreach(_.close())
|
||||
}
|
||||
} else {
|
||||
// Not a directory and not a ".py" file: contributes nothing to the result.
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
object Unidoc {
|
||||
|
|
|
@ -103,13 +103,5 @@
|
|||
<build>
|
||||
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
||||
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>../../python</directory>
|
||||
<includes>
|
||||
<include>pyspark/sql/*.py</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -105,13 +105,5 @@
|
|||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>../python</directory>
|
||||
<includes>
|
||||
<include>pyspark/streaming/*.py</include>
|
||||
</includes>
|
||||
</resource>
|
||||
</resources>
|
||||
</build>
|
||||
</project>
|
||||
|
|
Loading…
Reference in a new issue