<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements.  See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License.  You may obtain a copy of the License at
  ~
  ~    http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
    <version>1.3.0-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
  </parent>

  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.10</artifactId>
  <properties>
    <sbt.project.name>core</sbt.project.name>
  </properties>
  <packaging>jar</packaging>
  <name>Spark Project Core</name>
  <url>http://spark.apache.org/</url>

  <dependencies>
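    <!--
      Twitter chill provides the Kryo serializer registrations used by Spark's KryoSerializer.
      The org.ow2.asm exclusions below presumably keep a single asm on the classpath (note the
      separate asm:asm test dependency further down); that rationale is an inference, not
      something documented in this file.
    -->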
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>chill_${scala.binary.version}</artifactId>
      <exclusions>
        <exclusion>
          <groupId>org.ow2.asm</groupId>
          <artifactId>asm</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.ow2.asm</groupId>
          <artifactId>asm-commons</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>chill-java</artifactId>
      <exclusions>
        <exclusion>
          <groupId>org.ow2.asm</groupId>
          <artifactId>asm</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.ow2.asm</groupId>
          <artifactId>asm-commons</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
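    <!--
      servlet-api is excluded from hadoop-client; a likely reason is that the Servlet API
      shipped with Hadoop clashes with the Jetty servlet classes Spark uses for its web UI.
      This rationale is inferred, not stated in this file.
    -->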
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <exclusions>
        <exclusion>
          <groupId>javax.servlet</groupId>
          <artifactId>servlet-api</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
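    <!--
      Netty-based transport and shuffle modules, split out of core so the shuffle service can
      run standalone (e.g. inside the YARN NodeManager or the Standalone Worker) without
      dragging in Spark or Scala dependencies. See SPARK-3453 and SPARK-3796.
    -->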
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-network-common_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-network-shuffle_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>net.java.dev.jets3t</groupId>
      <artifactId>jets3t</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-recipes</artifactId>
    </dependency>
    <dependency>
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-plus</artifactId>
    </dependency>
    <dependency>
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-security</artifactId>
    </dependency>
    <dependency>
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-util</artifactId>
    </dependency>
    <dependency>
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-server</artifactId>
    </dependency>
    <!--
      Promote Guava to "compile" so that maven-shade-plugin picks it up (for packaging the Optional
      class exposed in the Java API). The plugin will then remove this dependency from the published
      pom, so that Guava does not pollute the client's compilation classpath.
    -->
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <scope>compile</scope>
    </dependency>
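    <!--
      Net effect (per SPARK-2848): Guava stops being a transitive dependency of the published
      spark-core artifact; at runtime, users get whatever Guava their own classpath provides.
    -->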
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-math3</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.code.findbugs</groupId>
      <artifactId>jsr305</artifactId>
    </dependency>
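    <!--
      Logging: slf4j backed by log4j, with jul-to-slf4j and jcl-over-slf4j routing
      java.util.logging and commons-logging calls into the same slf4j pipeline.
    -->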
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>jul-to-slf4j</artifactId>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>jcl-over-slf4j</artifactId>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
    </dependency>
    <dependency>
      <groupId>com.ning</groupId>
      <artifactId>compress-lzf</artifactId>
    </dependency>
    <dependency>
      <groupId>org.xerial.snappy</groupId>
      <artifactId>snappy-java</artifactId>
    </dependency>
    <dependency>
      <groupId>net.jpountz.lz4</groupId>
      <artifactId>lz4</artifactId>
    </dependency>
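    <!--
      RoaringBitmap backs HighlyCompressedMapStatus, which tracks which map output blocks are
      empty for stages with many partitions (SPARK-4019).
    -->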
    <dependency>
      <groupId>org.roaringbitmap</groupId>
      <artifactId>RoaringBitmap</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-net</groupId>
      <artifactId>commons-net</artifactId>
    </dependency>
    <dependency>
      <groupId>${akka.group}</groupId>
      <artifactId>akka-remote_${scala.binary.version}</artifactId>
    </dependency>
    <dependency>
      <groupId>${akka.group}</groupId>
      <artifactId>akka-slf4j_${scala.binary.version}</artifactId>
    </dependency>
    <dependency>
      <groupId>${akka.group}</groupId>
      <artifactId>akka-testkit_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
    </dependency>
    <dependency>
      <groupId>org.json4s</groupId>
      <artifactId>json4s-jackson_${scala.binary.version}</artifactId>
      <version>3.2.10</version>
    </dependency>
    <dependency>
      <groupId>org.apache.mesos</groupId>
      <artifactId>mesos</artifactId>
      <classifier>${mesos.classifier}</classifier>
    </dependency>
    <dependency>
      <groupId>io.netty</groupId>
      <artifactId>netty-all</artifactId>
    </dependency>
    <dependency>
      <groupId>com.clearspring.analytics</groupId>
      <artifactId>stream</artifactId>
    </dependency>
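    <!-- Codahale metrics libraries backing Spark's MetricsSystem and its JVM/JSON/Graphite sinks. -->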
    <dependency>
      <groupId>com.codahale.metrics</groupId>
      <artifactId>metrics-core</artifactId>
    </dependency>
    <dependency>
      <groupId>com.codahale.metrics</groupId>
      <artifactId>metrics-jvm</artifactId>
    </dependency>
    <dependency>
      <groupId>com.codahale.metrics</groupId>
      <artifactId>metrics-json</artifactId>
    </dependency>
    <dependency>
      <groupId>com.codahale.metrics</groupId>
      <artifactId>metrics-graphite</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.derby</groupId>
      <artifactId>derby</artifactId>
      <scope>test</scope>
    </dependency>
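    <!--
      tachyon-client pins 0.5.0 and excludes Hadoop, Curator, Jetty and test-framework
      artifacts that Spark already manages (or does not want) on its classpath; the intent is
      inferred from the exclusion list rather than documented here.
    -->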
    <dependency>
      <groupId>org.tachyonproject</groupId>
      <artifactId>tachyon-client</artifactId>
      <version>0.5.0</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-recipes</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty</groupId>
          <artifactId>jetty-jsp</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty</groupId>
          <artifactId>jetty-webapp</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty</groupId>
          <artifactId>jetty-server</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty</groupId>
          <artifactId>jetty-servlet</artifactId>
        </exclusion>
        <exclusion>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.powermock</groupId>
          <artifactId>powermock-module-junit4</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.powermock</groupId>
          <artifactId>powermock-api-mockito</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-test</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
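    <!-- Test-scoped dependencies. -->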
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-all</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.easymock</groupId>
      <artifactId>easymockclassextension</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>asm</groupId>
      <artifactId>asm</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.novocode</groupId>
      <artifactId>junit-interface</artifactId>
      <scope>test</scope>
    </dependency>
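    <!--
      Pyrolite converts between Python pickle data and JVM objects; it was added for the
      PySpark SQL API (SPARK-1374).
    -->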
    <dependency>
      <groupId>org.spark-project</groupId>
      <artifactId>pyrolite</artifactId>
      <version>2.0.1</version>
    </dependency>
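    <!--
      Py4J is the Python-to-JVM gateway used by PySpark. Keep this version in sync with the
      py4j-0.8.2.1-src.zip unpacked by the maven-antrun-plugin below.
    -->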
    <dependency>
      <groupId>net.sf.py4j</groupId>
      <artifactId>py4j</artifactId>
      <version>0.8.2.1</version>
    </dependency>
  </dependencies>

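  <!--
    Compiled classes go to sbt-style target/scala-<binary version> directories, presumably so
    that scripts such as compute-classpath.sh find classes in the same place regardless of
    which build produced them; this rationale is an assumption.
  -->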
  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    <plugins>
      <plugin>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest-maven-plugin</artifactId>
        <executions>
          <execution>
            <id>test</id>
            <goals>
              <goal>test</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Unzip py4j so we can include its files in the jar -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-antrun-plugin</artifactId>
        <executions>
          <execution>
            <phase>generate-resources</phase>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <tasks>
            <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
          </tasks>
        </configuration>
      </plugin>
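      <!--
        The unpacked ../python/build tree feeds the py4j/*.py resource include at the bottom of
        this file, and the maven-clean-plugin below removes it on mvn clean.
      -->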
      <plugin>
        <artifactId>maven-clean-plugin</artifactId>
        <configuration>
          <filesets>
            <fileset>
              <directory>${basedir}/../python/build</directory>
            </fileset>
          </filesets>
          <verbose>true</verbose>
        </configuration>
      </plugin>
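      <!--
        Bundle only the forked Guava Optional class (plus its Absent/Present companions) into
        the core jar; everything else from Guava is left out. See SPARK-2848 and the guava
        dependency declaration above.
      -->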
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <shadedArtifactAttached>false</shadedArtifactAttached>
              <artifactSet>
                <includes>
                  <include>com.google.guava:guava</include>
                </includes>
              </artifactSet>
              <filters>
                <!-- See comment in the guava dependency declaration above. -->
                <filter>
                  <artifact>com.google.guava:guava</artifact>
                  <includes>
                    <include>com/google/common/base/Absent*</include>
                    <include>com/google/common/base/Optional*</include>
                    <include>com/google/common/base/Present*</include>
                  </includes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!--
        Copy guava to the build directory. This is needed to make the SPARK_PREPEND_CLASSES
        option work in compute-classpath.sh, since it would put the non-shaded Spark classes in
        the runtime classpath.
      -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
          <execution>
            <id>copy-dependencies</id>
            <phase>package</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
            <configuration>
              <outputDirectory>${project.build.directory}</outputDirectory>
              <overWriteReleases>false</overWriteReleases>
              <overWriteSnapshots>false</overWriteSnapshots>
              <overWriteIfNewer>true</overWriteIfNewer>
              <useSubDirectoryPerType>true</useSubDirectoryPerType>
              <includeArtifactIds>guava</includeArtifactIds>
              <silent>true</silent>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
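    <!--
      Package PySpark sources and the unpacked Py4J sources into the jar, so the Python files
      can be served straight from the Spark assembly (see SPARK-1549).
    -->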
    <resources>
      <resource>
        <directory>src/main/resources</directory>
      </resource>
      <resource>
        <directory>../python</directory>
        <includes>
          <include>pyspark/*.py</include>
        </includes>
      </resource>
      <resource>
        <directory>../python/build</directory>
        <includes>
          <include>py4j/*.py</include>
        </includes>
      </resource>
    </resources>
  </build>

</project>