/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.typesafe.tools.mima.core._
import com.typesafe.tools.mima.core.ProblemFilters._

/**
 * Additional excludes for checking of Spark's binary compatibility.
 *
 * The Mima build will automatically exclude @DeveloperApi and @Experimental classes. This acts
 * as an official audit of cases where we excluded other classes. Please use the narrowest
 * possible exclude here. MIMA will usually tell you what exclude to use, e.g.:
 *
 * ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.rdd.RDD.take")
 *
 * It is also possible to exclude Spark classes and packages. This should be used sparingly:
 *
* MimaBuild.excludeSparkClass("graphx.util.collection.GraphXPrimitiveKeyOpenHashMap")
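 *
 * Wildcard excludes such as excludeSparkPackage suppress every problem reported
 * under the given package, so prefer the narrower single-member form whenever it
 * suffices.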
 */
object MimaExcludes {
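  // These filters are consumed by the MiMa sbt plugin (wired up in
  // MimaBuild.scala). A minimal sketch of that wiring, assuming the plugin's
  // standard `binaryIssueFilters` setting (illustrative only; see
  // MimaBuild.scala for the actual hook-up):
  //
  //   import com.typesafe.tools.mima.plugin.MimaKeys.binaryIssueFilters
  //   binaryIssueFilters ++= MimaExcludes.excludes(version.value)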
  def excludes(version: String) =
    version match {
      case v if v.startsWith("1.4") =>
        Seq(
          MimaBuild.excludeSparkPackage("deploy"),
          MimaBuild.excludeSparkPackage("ml"),
          // SPARK-5922 Adding a generalized diff(other: RDD[(VertexId, VD)]) to VertexRDD
          ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.diff"),
          // These are needed if checking against the sbt build, since they are part of
          // the maven-generated artifacts in 1.3.
          excludePackage("org.spark-project.jetty"),
          MimaBuild.excludeSparkPackage("unused"),
          ProblemFilters.exclude[MissingClassProblem]("com.google.common.base.Optional"),
          ProblemFilters.exclude[IncompatibleResultTypeProblem](
            "org.apache.spark.rdd.JdbcRDD.compute"),
          ProblemFilters.exclude[IncompatibleResultTypeProblem](
            "org.apache.spark.broadcast.HttpBroadcastFactory.newBroadcast"),
          ProblemFilters.exclude[IncompatibleResultTypeProblem](
            "org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorActor")
        ) ++ Seq(
          // SPARK-4655 - Making Stage an abstract class broke binary compatibility even though
          // the Stage class is defined as private[spark]
ProblemFilters.exclude[AbstractClassProblem]("org.apache.spark.scheduler.Stage")
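          // (Note: `private[spark]` restricts access only at the Scala level; in
          // the emitted bytecode Stage is public, which is why MiMa still sees
          // the change.)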
        ) ++ Seq(
          // SPARK-6510 Add a Graph#minus method acting as Set#difference
          ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.minus")
        ) ++ Seq(
          // SPARK-6492 Fix deadlock in SparkContext.stop()
          ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.SparkContext.org$" +
"apache$spark$SparkContext$$SPARK_CONTEXT_CONSTRUCTOR_LOCK")
        ) ++ Seq(
          // SPARK-6693 add toString with max lines and width for matrix
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Matrix.toString")
        ) ++ Seq(
          // SPARK-6703 Add getOrCreate method to SparkContext
          ProblemFilters.exclude[IncompatibleResultTypeProblem]
            ("org.apache.spark.SparkContext.org$apache$spark$SparkContext$$activeContext")
        ) ++ Seq(
          // SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.mllib.clustering.LDA$EMOptimizer")
        ) ++ Seq(
          // SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.compressed"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.toDense"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.numNonzeros"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.toSparse"),
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.mllib.linalg.Vector.numActives")
        ) ++ Seq(
          // Execution should never be included as it's always internal.
          MimaBuild.excludeSparkPackage("sql.execution"),
          // This `protected[sql]` method was removed in 1.3.1
          ProblemFilters.exclude[MissingMethodProblem](
            "org.apache.spark.sql.SQLContext.checkAnalysis"),
          // These `private[sql]` classes were removed in 1.4.0:
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.execution.AddExchange"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.execution.AddExchange$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.PartitionSpec"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.PartitionSpec$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.Partition"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.Partition$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetRelation2$PartitionValues"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetRelation2$PartitionValues$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetRelation2"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetRelation2$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetRelation2$MetadataCache"),
          // These test support classes were moved out of src/main and into src/test:
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetTestData"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.ParquetTestData$"),
          ProblemFilters.exclude[MissingClassProblem](
            "org.apache.spark.sql.parquet.TestGroupWriteSupport"),
          ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CachedData"),
          ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CachedData$"),
          ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.CacheManager")
        ) ++ Seq(
          // SPARK-7530 Added StreamingContext.getState()
          ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.streaming.StreamingContext.state_=")
        ) ++ Seq(
          // SPARK-7081 changed ShuffleWriter from a trait to an abstract class and removed some
          // unnecessary type bounds in order to fix some compiler warnings that occurred when
          // implementing this interface in Java. Note that ShuffleWriter is private[spark].
          ProblemFilters.exclude[IncompatibleTemplateDefProblem](
"org.apache.spark.shuffle.ShuffleWriter")
|
2015-03-20 14:43:57 -04:00
|
|
|
)
|
|
|
|
|
2014-11-19 00:24:18 -05:00
|
|
|
case v if v.startsWith("1.3") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
[SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs
This is part (1a) of the updates from the design doc in [https://docs.google.com/document/d/1BH9el33kBX8JiDdgUJXdLW14CA2qhTCWIG46eXZVoJs]
**UPDATE**: Most of the APIs are being kept private[spark] to allow further discussion. Here is a list of changes which are public:
* new output columns: rawPrediction, probabilities
* The “score” column is now called “rawPrediction”
* Classifiers now provide numClasses
* Params.get and .set are now protected instead of private[ml].
* ParamMap now has a size method.
* new classes: LinearRegression, LinearRegressionModel
* LogisticRegression now has an intercept.
### Sketch of APIs (most of which are private[spark] for now)
Abstract classes for learning algorithms (+ corresponding Model abstractions):
* Classifier (+ ClassificationModel)
* ProbabilisticClassifier (+ ProbabilisticClassificationModel)
* Regressor (+ RegressionModel)
* Predictor (+ PredictionModel)
* *For all of these*:
* There is no strongly typed training-time API.
* There is a strongly typed test-time (prediction) API which helps developers implement new algorithms.
Concrete classes: learning algorithms
* LinearRegression
* LogisticRegression (updated to use new abstract classes)
* Also, removed "score" in favor of "probability" output column. Changed BinaryClassificationEvaluator to match. (SPARK-5031)
Other updates:
* params.scala: Changed Params.set/get to be protected instead of private[ml]
* This was needed for the example of defining a class from outside of the MLlib namespace.
* VectorUDT: Will later change from private[spark] to public.
* This is needed for outside users to write their own validateAndTransformSchema() methods using vectors.
* Also, added equals() method.f
* SPARK-4942 : ML Transformers should allow output cols to be turned on,off
* Update validateAndTransformSchema
* Update transform
* (Updated examples, test suites according to other changes)
New examples:
* DeveloperApiExample.scala (example of defining algorithm from outside of the MLlib namespace)
* Added Java version too
Test Suites:
* LinearRegressionSuite
* LogisticRegressionSuite
* + Java versions of above suites
CC: mengxr etrain shivaram
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #3637 from jkbradley/ml-api-part1 and squashes the following commits:
405bfb8 [Joseph K. Bradley] Last edits based on code review. Small cleanups
fec348a [Joseph K. Bradley] Added JavaDeveloperApiExample.java and fixed other issues: Made developer API private[spark] for now. Added constructors Java can understand to specialized Param types.
8316d5e [Joseph K. Bradley] fixes after rebasing on master
fc62406 [Joseph K. Bradley] fixed test suites after last commit
bcb9549 [Joseph K. Bradley] Fixed issues after rebasing from master (after move from SchemaRDD to DataFrame)
9872424 [Joseph K. Bradley] fixed JavaLinearRegressionSuite.java Java sql api
f542997 [Joseph K. Bradley] Added MIMA excludes for VectorUDT (now public), and added DeveloperApi annotation to it
216d199 [Joseph K. Bradley] fixed after sql datatypes PR got merged
f549e34 [Joseph K. Bradley] Updates based on code review. Major ones are: * Created weakly typed Predictor.train() method which is called by fit() so that developers do not have to call schema validation or copy parameters. * Made Predictor.featuresDataType have a default value of VectorUDT. * NOTE: This could be dangerous since the FeaturesType type parameter cannot have a default value.
343e7bd [Joseph K. Bradley] added blanket mima exclude for ml package
82f340b [Joseph K. Bradley] Fixed bug in LogisticRegression (introduced in this PR). Fixed Java suites
0a16da9 [Joseph K. Bradley] Fixed Linear/Logistic RegressionSuites
c3c8da5 [Joseph K. Bradley] small cleanup
934f97b [Joseph K. Bradley] Fixed bugs from previous commit.
1c61723 [Joseph K. Bradley] * Made ProbabilisticClassificationModel into a subclass of ClassificationModel. Also introduced ProbabilisticClassifier. * This was to support output column “probabilityCol” in transform().
4e2f711 [Joseph K. Bradley] rat fix
bc654e1 [Joseph K. Bradley] Added spark.ml LinearRegressionSuite
8d13233 [Joseph K. Bradley] Added methods: * Classifier: batch predictRaw() * Predictor: train() without paramMap ProbabilisticClassificationModel.predictProbabilities() * Java versions of all above batch methods + others
1680905 [Joseph K. Bradley] Added JavaLabeledPointSuite.java for spark.ml, and added constructor to LabeledPoint which defaults weight to 1.0
adbe50a [Joseph K. Bradley] * fixed LinearRegression train() to use embedded paramMap * added Predictor.predict(RDD[Vector]) method * updated Linear/LogisticRegressionSuites
58802e3 [Joseph K. Bradley] added train() to Predictor subclasses which does not take a ParamMap.
57d54ab [Joseph K. Bradley] * Changed semantics of Predictor.train() to merge the given paramMap with the embedded paramMap. * remove threshold_internal from logreg * Added Predictor.copy() * Extended LogisticRegressionSuite
e433872 [Joseph K. Bradley] Updated docs. Added LabeledPointSuite to spark.ml
54b7b31 [Joseph K. Bradley] Fixed issue with logreg threshold being set correctly
0617d61 [Joseph K. Bradley] Fixed bug from last commit (sorting paramMap by parameter names in toString). Fixed bug in persisting logreg data. Added threshold_internal to logreg for faster test-time prediction (avoiding map lookup).
601e792 [Joseph K. Bradley] Modified ParamMap to sort parameters in toString. Cleaned up classes in class hierarchy, before implementing tests and examples.
d705e87 [Joseph K. Bradley] Added LinearRegression and Regressor back from ml-api branch
52f4fde [Joseph K. Bradley] removing everything except for simple class hierarchy for classification
d35bb5d [Joseph K. Bradley] fixed compilation issues, but have not added tests yet
bfade12 [Joseph K. Bradley] Added lots of classes for new ML API:
2015-02-06 02:43:47 -05:00
|
|
|
MimaBuild.excludeSparkPackage("ml"),
|
2014-11-19 00:24:18 -05:00
|
|
|
// These are needed if checking against the sbt build, since they are part of
|
|
|
|
// the maven-generated artifacts in the 1.2 build.
|
|
|
|
MimaBuild.excludeSparkPackage("unused"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("com.google.common.base.Optional")
|
2014-11-19 17:03:44 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-2321
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.SparkStageInfoImpl.this"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.SparkStageInfo.submissionTime")
|
2014-11-26 11:22:50 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4614
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrices.randn"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrices.rand")
|
2015-01-27 04:46:17 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5321
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.SparseMatrix.transposeMultiply"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.transpose"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.DenseMatrix.transposeMultiply"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Matrix." +
|
|
|
|
"org$apache$spark$mllib$linalg$Matrix$_setter_$isTransposed_="),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.isTransposed"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.foreachActive")
|
2015-02-02 20:10:01 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5540
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
2015-02-03 02:49:09 -05:00
|
|
|
"org.apache.spark.mllib.recommendation.ALS.solveLeastSquares"),
|
|
|
|
// SPARK-5536
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateFeatures"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateBlock")
|
2014-12-31 19:59:17 -05:00
|
|
|
) ++ Seq(
|
2015-01-02 18:09:41 -05:00
|
|
|
// SPARK-3325
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaDStreamLike.print"),
|
2014-12-31 19:59:17 -05:00
|
|
|
// SPARK-2757
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.streaming.flume.sink.SparkAvroCallbackHandler." +
|
|
|
|
"removeAndGetProcessor")
|
2015-01-13 20:16:41 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5123 (SparkSQL data type change) - alpha component only
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.HashingTF.outputDataType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.Tokenizer.outputDataType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.Tokenizer.validateInputType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.classification.LogisticRegressionModel.validateAndTransformSchema"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema")
|
[SPARK-4014] Add TaskContext.attemptNumber and deprecate TaskContext.attemptId
`TaskContext.attemptId` is misleadingly-named, since it currently returns a taskId, which uniquely identifies a particular task attempt within a particular SparkContext, instead of an attempt number, which conveys how many times a task has been attempted.
This patch deprecates `TaskContext.attemptId` and add `TaskContext.taskId` and `TaskContext.attemptNumber` fields. Prior to this change, it was impossible to determine whether a task was being re-attempted (or was a speculative copy), which made it difficult to write unit tests for tasks that fail on early attempts or speculative tasks that complete faster than original tasks.
Earlier versions of the TaskContext docs suggest that `attemptId` behaves like `attemptNumber`, so there's an argument to be made in favor of changing this method's implementation. Since we've decided against making that change in maintenance branches, I think it's simpler to add better-named methods and retain the old behavior for `attemptId`; if `attemptId` behaved differently in different branches, then this would cause confusing build-breaks when backporting regression tests that rely on the new `attemptId` behavior.
Most of this patch is fairly straightforward, but there is a bit of trickiness related to Mesos tasks: since there's no field in MesosTaskInfo to encode the attemptId, I packed it into the `data` field alongside the task binary.
Author: Josh Rosen <joshrosen@databricks.com>
Closes #3849 from JoshRosen/SPARK-4014 and squashes the following commits:
89d03e0 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
5cfff05 [Josh Rosen] Introduce wrapper for serializing Mesos task launch data.
38574d4 [Josh Rosen] attemptId -> taskAttemptId in PairRDDFunctions
a180b88 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
1d43aa6 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
eee6a45 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
0b10526 [Josh Rosen] Use putInt instead of putLong (silly mistake)
8c387ce [Josh Rosen] Use local with maxRetries instead of local-cluster.
cbe4d76 [Josh Rosen] Preserve attemptId behavior and deprecate it:
b2dffa3 [Josh Rosen] Address some of Reynold's minor comments
9d8d4d1 [Josh Rosen] Doc typo
1e7a933 [Josh Rosen] [SPARK-4014] Change TaskContext.attemptId to return attempt number instead of task ID.
fd515a5 [Josh Rosen] Add failing test for SPARK-4014
2015-01-14 14:45:40 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4014
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.TaskContext.taskAttemptId"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.TaskContext.attemptNumber")
|
2015-01-17 00:09:06 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5166 Spark SQL API stabilization
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Transformer.transform"),
|
2015-01-27 19:08:24 -05:00
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Transformer.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Pipeline.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.PipelineModel.transform"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Estimator.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Evaluator.evaluate"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Evaluator.evaluate"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidator.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidatorModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScaler.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScalerModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegression.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate")
|
2015-01-20 01:50:44 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5270
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.isEmpty")
|
2015-01-28 20:26:03 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5430
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.treeReduce"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.treeAggregate")
|
2015-01-21 02:37:47 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5297 Java FileStream do not work with custom key/values
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaStreamingContext.fileStream")
|
2015-01-23 01:04:21 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5315 Spark Streaming Java API returns Scala DStream
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaDStreamLike.reduceByWindow")
|
2015-02-02 17:34:48 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5461 Graph should have isCheckpointed, getCheckpointFiles methods
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.graphx.Graph.getCheckpointFiles"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.graphx.Graph.isCheckpointed")
|
[SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs
This is part (1a) of the updates from the design doc in [https://docs.google.com/document/d/1BH9el33kBX8JiDdgUJXdLW14CA2qhTCWIG46eXZVoJs]
**UPDATE**: Most of the APIs are being kept private[spark] to allow further discussion. Here is a list of changes which are public:
* new output columns: rawPrediction, probabilities
* The “score” column is now called “rawPrediction”
* Classifiers now provide numClasses
* Params.get and .set are now protected instead of private[ml].
* ParamMap now has a size method.
* new classes: LinearRegression, LinearRegressionModel
* LogisticRegression now has an intercept.
### Sketch of APIs (most of which are private[spark] for now)
Abstract classes for learning algorithms (+ corresponding Model abstractions):
* Classifier (+ ClassificationModel)
* ProbabilisticClassifier (+ ProbabilisticClassificationModel)
* Regressor (+ RegressionModel)
* Predictor (+ PredictionModel)
* *For all of these*:
* There is no strongly typed training-time API.
* There is a strongly typed test-time (prediction) API which helps developers implement new algorithms.
Concrete classes: learning algorithms
* LinearRegression
* LogisticRegression (updated to use new abstract classes)
* Also, removed "score" in favor of "probability" output column. Changed BinaryClassificationEvaluator to match. (SPARK-5031)
Other updates:
* params.scala: Changed Params.set/get to be protected instead of private[ml]
* This was needed for the example of defining a class from outside of the MLlib namespace.
* VectorUDT: Will later change from private[spark] to public.
* This is needed for outside users to write their own validateAndTransformSchema() methods using vectors.
* Also, added equals() method.f
* SPARK-4942 : ML Transformers should allow output cols to be turned on,off
* Update validateAndTransformSchema
* Update transform
* (Updated examples, test suites according to other changes)
New examples:
* DeveloperApiExample.scala (example of defining algorithm from outside of the MLlib namespace)
* Added Java version too
Test Suites:
* LinearRegressionSuite
* LogisticRegressionSuite
* + Java versions of above suites
CC: mengxr etrain shivaram
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #3637 from jkbradley/ml-api-part1 and squashes the following commits:
405bfb8 [Joseph K. Bradley] Last edits based on code review. Small cleanups
fec348a [Joseph K. Bradley] Added JavaDeveloperApiExample.java and fixed other issues: Made developer API private[spark] for now. Added constructors Java can understand to specialized Param types.
8316d5e [Joseph K. Bradley] fixes after rebasing on master
fc62406 [Joseph K. Bradley] fixed test suites after last commit
bcb9549 [Joseph K. Bradley] Fixed issues after rebasing from master (after move from SchemaRDD to DataFrame)
9872424 [Joseph K. Bradley] fixed JavaLinearRegressionSuite.java Java sql api
f542997 [Joseph K. Bradley] Added MIMA excludes for VectorUDT (now public), and added DeveloperApi annotation to it
216d199 [Joseph K. Bradley] fixed after sql datatypes PR got merged
f549e34 [Joseph K. Bradley] Updates based on code review. Major ones are: * Created weakly typed Predictor.train() method which is called by fit() so that developers do not have to call schema validation or copy parameters. * Made Predictor.featuresDataType have a default value of VectorUDT. * NOTE: This could be dangerous since the FeaturesType type parameter cannot have a default value.
343e7bd [Joseph K. Bradley] added blanket mima exclude for ml package
82f340b [Joseph K. Bradley] Fixed bug in LogisticRegression (introduced in this PR). Fixed Java suites
0a16da9 [Joseph K. Bradley] Fixed Linear/Logistic RegressionSuites
c3c8da5 [Joseph K. Bradley] small cleanup
934f97b [Joseph K. Bradley] Fixed bugs from previous commit.
1c61723 [Joseph K. Bradley] * Made ProbabilisticClassificationModel into a subclass of ClassificationModel. Also introduced ProbabilisticClassifier. * This was to support output column “probabilityCol” in transform().
4e2f711 [Joseph K. Bradley] rat fix
bc654e1 [Joseph K. Bradley] Added spark.ml LinearRegressionSuite
8d13233 [Joseph K. Bradley] Added methods: * Classifier: batch predictRaw() * Predictor: train() without paramMap ProbabilisticClassificationModel.predictProbabilities() * Java versions of all above batch methods + others
1680905 [Joseph K. Bradley] Added JavaLabeledPointSuite.java for spark.ml, and added constructor to LabeledPoint which defaults weight to 1.0
adbe50a [Joseph K. Bradley] * fixed LinearRegression train() to use embedded paramMap * added Predictor.predict(RDD[Vector]) method * updated Linear/LogisticRegressionSuites
58802e3 [Joseph K. Bradley] added train() to Predictor subclasses which does not take a ParamMap.
57d54ab [Joseph K. Bradley] * Changed semantics of Predictor.train() to merge the given paramMap with the embedded paramMap. * remove threshold_internal from logreg * Added Predictor.copy() * Extended LogisticRegressionSuite
e433872 [Joseph K. Bradley] Updated docs. Added LabeledPointSuite to spark.ml
54b7b31 [Joseph K. Bradley] Fixed issue with logreg threshold being set correctly
0617d61 [Joseph K. Bradley] Fixed bug from last commit (sorting paramMap by parameter names in toString). Fixed bug in persisting logreg data. Added threshold_internal to logreg for faster test-time prediction (avoiding map lookup).
601e792 [Joseph K. Bradley] Modified ParamMap to sort parameters in toString. Cleaned up classes in class hierarchy, before implementing tests and examples.
d705e87 [Joseph K. Bradley] Added LinearRegression and Regressor back from ml-api branch
52f4fde [Joseph K. Bradley] removing everything except for simple class hierarchy for classification
d35bb5d [Joseph K. Bradley] fixed compilation issues, but have not added tests yet
bfade12 [Joseph K. Bradley] Added lots of classes for new ML API:
      ) ++ Seq(
        // SPARK-4789 Standardize ML Prediction APIs
        ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.mllib.linalg.VectorUDT"),
        ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.serialize"),
        ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.sqlType")
      ) ++ Seq(
        // SPARK-5814
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$wrapDoubleArray"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$fillFullMatrix"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$iterations"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeOutLinkBlock"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$computeYtY"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeLinkRDDs"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$alpha"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$randomFactor"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeInLinkBlock"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$dspr"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$lambda"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$implicitPrefs"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$rank")
      ) ++ Seq(
        // SPARK-4682
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.RealClock"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.Clock"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.TestClock")
      ) ++ Seq(
        // SPARK-5922 Adding a generalized diff(other: RDD[(VertexId, VD)]) to VertexRDD
        ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.diff")
      )
case v if v.startsWith("1.2") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
|
|
|
MimaBuild.excludeSparkPackage("graphx")
|
2014-09-19 01:18:51 -04:00
|
|
|
) ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.linalg.Matrix") ++
|
[MLlib] [SPARK-2885] DIMSUM: All-pairs similarity
# All-pairs similarity via DIMSUM
Compute all pairs of similar vectors using a brute-force approach, and also the DIMSUM sampling approach.
Laying down some notation: we are looking for all pairs of similar columns in an m x n RowMatrix whose entries are denoted a_ij, with the i'th row denoted r_i and the j'th column denoted c_j. There is an oversampling parameter ɣ that should be set to 4 log(n)/s to get provably correct results (with high probability), where s is the similarity threshold.
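A worked instance of the oversampling rule, assuming the natural logarithm:
~~~
\gamma = \frac{4 \log n}{s}
       = \frac{4 \ln 10^{6}}{0.5}
       \approx \frac{4 \times 13.82}{0.5}
       \approx 110.5
       \qquad (n = 10^{6}\ \text{columns},\ s = 0.5)
~~~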
The algorithm is stated with a Map and Reduce, with proofs of correctness and efficiency in published papers [1] [2]. The reducer is simply the summation reducer. The mapper is more interesting, and is also the heart of the scheme. As an exercise, you should try to see why, in expectation, the map-reduce below outputs cosine similarities.
![dimsumv2](https://cloud.githubusercontent.com/assets/3220351/3807272/d1d9514e-1c62-11e4-9f12-3cfdb1d78b3a.png)
[1] Bosagh-Zadeh, Reza and Carlsson, Gunnar (2013), Dimension Independent Matrix Square using MapReduce, arXiv:1304.1467 http://arxiv.org/abs/1304.1467
[2] Bosagh-Zadeh, Reza and Goel, Ashish (2012), Dimension Independent Similarity Computation, arXiv:1206.2082 http://arxiv.org/abs/1206.2082
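A small usage sketch of the RowMatrix.columnSimilarities API this PR adds (the tiny input and driver setup here are illustrative only):
~~~
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

def similarColumns(sc: SparkContext): Unit = {
  val rows = sc.parallelize(Seq(
    Vectors.dense(1.0, 0.0, 2.0),
    Vectors.dense(0.0, 3.0, 4.0),
    Vectors.dense(5.0, 6.0, 0.0)))
  val mat = new RowMatrix(rows)
  // Brute force: exact cosine similarity for every column pair.
  val exact = mat.columnSimilarities()
  // DIMSUM sampling: pairs above the threshold are estimated accurately
  // with high probability, at a fraction of the shuffle cost.
  val approx = mat.columnSimilarities(threshold = 0.1)
  exact.entries.collect().foreach(println)
  approx.entries.collect().foreach(println)
}
~~~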
# Testing
Tests for all invocations included.
Added L1 and L2 norm computation to MultivariateStatisticalSummary since it was needed. Added tests for both of them.
Author: Reza Zadeh <rizlar@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Closes #1778 from rezazadeh/dimsumv2 and squashes the following commits:
404c64c [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
4eb71c6 [Reza Zadeh] Add excludes for normL1 and normL2
ee8bd65 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
976ddd4 [Reza Zadeh] Broadcast colMags. Avoid div by zero.
3467cff [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
aea0247 [Reza Zadeh] Allow large thresholds to promote sparsity
9fe17c0 [Xiangrui Meng] organize imports
2196ba5 [Xiangrui Meng] Merge branch 'rezazadeh-dimsumv2' into dimsumv2
254ca08 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
f2947e4 [Xiangrui Meng] some optimization
3c4cf41 [Xiangrui Meng] Merge branch 'master' into rezazadeh-dimsumv2
0e4eda4 [Reza Zadeh] Use partition index for RNG
251bb9c [Reza Zadeh] Documentation
25e9d0d [Reza Zadeh] Line length for style
fb296f6 [Reza Zadeh] renamed to normL1 and normL2
3764983 [Reza Zadeh] Documentation
e9c6791 [Reza Zadeh] New interface and documentation
613f261 [Reza Zadeh] Column magnitude summary
75a0b51 [Reza Zadeh] Use Ints instead of Longs in the shuffle
0f12ade [Reza Zadeh] Style changes
eb1dc20 [Reza Zadeh] Use Double.PositiveInfinity instead of Double.Max
f56a882 [Reza Zadeh] Remove changes to MultivariateOnlineSummarizer
dbc55ba [Reza Zadeh] Make colMagnitudes a method in RowMatrix
41e8ece [Reza Zadeh] style changes
139c8e1 [Reza Zadeh] Syntax changes
029aa9c [Reza Zadeh] javadoc and new test
75edb25 [Reza Zadeh] All tests passing!
05e59b8 [Reza Zadeh] Add test
502ce52 [Reza Zadeh] new interface
654c4fb [Reza Zadeh] default methods
3726ca9 [Reza Zadeh] Remove MatrixAlgebra
6bebabb [Reza Zadeh] remove changes to MatrixSuite
5b8cd7d [Reza Zadeh] Initial files
      MimaBuild.excludeSparkClass("mllib.linalg.Vector") ++
      Seq(
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.scheduler.TaskLocation"),
        // Added normL1 and normL2 to trait MultivariateStatisticalSummary
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.stat.MultivariateStatisticalSummary.normL1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.stat.MultivariateStatisticalSummary.normL2"),
        // MapStatus should be private[spark]
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.scheduler.MapStatus"),
[SPARK-3453] Netty-based BlockTransferService, extracted from Spark core
This PR encapsulates #2330, which is itself a continuation of #2240. The first goal of this PR is to provide an alternate, simpler implementation of the ConnectionManager which is based on Netty.
In addition to this goal, however, we want to resolve [SPARK-3796](https://issues.apache.org/jira/browse/SPARK-3796), which calls for a standalone shuffle service which can be integrated into the YARN NodeManager, Standalone Worker, or on its own. This PR makes the first step in this direction by ensuring that the actual Netty service is as small as possible and extracted from Spark core. Given this, we should be able to construct this standalone jar which can be included in other JVMs without incurring significant dependency or runtime issues. The actual work to ensure that such a standalone shuffle service would work in Spark will be left for a future PR, however.
In order to minimize dependencies and allow for the service to be long-running (possibly much longer-running than Spark, and possibly having to support multiple versions of Spark simultaneously), the entire service has been ported to Java, where we have full control over the binary compatibility of the components and do not depend on the Scala runtime or version.
These issues have been addressed by folding in #2330:
SPARK-3453: Refactor Netty module to use BlockTransferService interface
SPARK-3018: Release all buffers upon task completion/failure
SPARK-3002: Create a connection pool and reuse clients across different threads
SPARK-3017: Integration tests and unit tests for connection failures
SPARK-3049: Make sure client doesn't block when server/connection has error(s)
SPARK-3502: SO_RCVBUF and SO_SNDBUF should be bootstrap childOption, not option
SPARK-3503: Disable thread local cache in PooledByteBufAllocator
TODO before mergeable:
- [x] Implement uploadBlock()
- [x] Unit tests for RPC side of code
- [x] Performance testing (see comments [here](https://github.com/apache/spark/pull/2753#issuecomment-59475022))
- [x] Turn OFF by default (currently on for unit testing)
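For completeness, a hedged sketch of how the transport is selected at runtime, assuming the spark.shuffle.blockTransferService configuration key described in the 1.x docs (at the time of this PR the default was still the NIO path):
~~~
// Sketch only: the flag name is taken from the 1.x configuration docs, and
// the values ("netty" vs. "nio") select between this new service and the
// older ConnectionManager-based one.
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("block-transfer-demo")
  .setMaster("local[2]")
  .set("spark.shuffle.blockTransferService", "netty")
val sc = new SparkContext(conf)
~~~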
Author: Reynold Xin <rxin@apache.org>
Author: Aaron Davidson <aaron@databricks.com>
Author: cocoatomo <cocoatomo77@gmail.com>
Author: Patrick Wendell <pwendell@gmail.com>
Author: Prashant Sharma <prashant.s@imaginea.com>
Author: Davies Liu <davies.liu@gmail.com>
Author: Anand Avati <avati@redhat.com>
Closes #2753 from aarondav/netty and squashes the following commits:
cadfd28 [Aaron Davidson] Turn netty off by default
d7be11b [Aaron Davidson] Turn netty on by default
4a204b8 [Aaron Davidson] Fail block fetches if client connection fails
2b0d1c0 [Aaron Davidson] 100ch
0c5bca2 [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty
14e37f7 [Aaron Davidson] Address Reynold's comments
8dfcceb [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty
322dfc1 [Aaron Davidson] Address Reynold's comments, including major rename
e5675a4 [Aaron Davidson] Fail outstanding RPCs as well
ccd4959 [Aaron Davidson] Don't throw exception if client immediately fails
9da0bc1 [Aaron Davidson] Add RPC unit tests
d236dfd [Aaron Davidson] Remove no-op serializer :)
7b7a26c [Aaron Davidson] Fix Nio compile issue
dd420fd [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty-test
939f276 [Aaron Davidson] Attempt to make comm. bidirectional
aa58f67 [cocoatomo] [SPARK-3909][PySpark][Doc] A corrupted format in Sphinx documents and building warnings
8dc1ded [cocoatomo] [SPARK-3867][PySpark] ./python/run-tests failed when it run with Python 2.6 and unittest2 is not installed
5b5dbe6 [Prashant Sharma] [SPARK-2924] Required by scala 2.11, only one fun/ctor amongst overriden alternatives, can have default argument(s).
2c5d9dc [Patrick Wendell] HOTFIX: Fix build issue with Akka 2.3.4 upgrade.
020691e [Davies Liu] [SPARK-3886] [PySpark] use AutoBatchedSerializer by default
ae4083a [Anand Avati] [SPARK-2805] Upgrade Akka to 2.3.4
29c6dcf [Aaron Davidson] [SPARK-3453] Netty-based BlockTransferService, extracted from Spark core
f7e7568 [Reynold Xin] Fixed spark.shuffle.io.receiveBuffer setting.
5d98ce3 [Reynold Xin] Flip buffer.
f6c220d [Reynold Xin] Merge with latest master.
407e59a [Reynold Xin] Fix style violation.
a0518c7 [Reynold Xin] Implemented block uploads.
4b18db2 [Reynold Xin] Copy the buffer in fetchBlockSync.
bec4ea2 [Reynold Xin] Removed OIO and added num threads settings.
1bdd7ee [Reynold Xin] Fixed tests.
d68f328 [Reynold Xin] Logging close() in case close() fails.
f63fb4c [Reynold Xin] Add more debug message.
6afc435 [Reynold Xin] Added logging.
c066309 [Reynold Xin] Implement java.io.Closeable interface.
519d64d [Reynold Xin] Mark private package visibility and MimaExcludes.
f0a16e9 [Reynold Xin] Fixed test hanging.
14323a5 [Reynold Xin] Removed BlockManager.getLocalShuffleFromDisk.
b2f3281 [Reynold Xin] Added connection pooling.
d23ed7b [Reynold Xin] Incorporated feedback from Norman: - use same pool for boss and worker - remove ioratio - disable caching of byte buf allocator - childoption sendbuf/receivebuf - fire exception through pipeline
9e0cb87 [Reynold Xin] Fixed BlockClientHandlerSuite
5cd33d7 [Reynold Xin] Fixed style violation.
cb589ec [Reynold Xin] Added more test cases covering cleanup when fault happens in ShuffleBlockFetcherIteratorSuite
1be4e8e [Reynold Xin] Shorten NioManagedBuffer and NettyManagedBuffer class names.
108c9ed [Reynold Xin] Forgot to add TestSerializer to the commit list.
b5c8d1f [Reynold Xin] Fixed ShuffleBlockFetcherIteratorSuite.
064747b [Reynold Xin] Reference count buffers and clean them up properly.
2b44cf1 [Reynold Xin] Added more documentation.
1760d32 [Reynold Xin] Use Epoll.isAvailable in BlockServer as well.
165eab1 [Reynold Xin] [SPARK-3453] Refactor Netty module to use BlockTransferService.
        ProblemFilters.exclude[MissingClassProblem](
          "org.apache.spark.network.netty.PathResolver"),
        ProblemFilters.exclude[MissingClassProblem](
          "org.apache.spark.network.netty.client.BlockClientListener"),
[SPARK-4084] Reuse sort key in Sorter
Sorter uses a generic-typed key for sorting. When the data is large, this creates lots of key objects, which is not efficient. We should reuse the key in Sorter for memory efficiency. This change is part of the petabyte sort implementation from rxin.
The `Sorter` class was written in Java and marked package private, so it is only available to `org.apache.spark.util.collection`. I renamed it to `TimSort` and added a simple Scala wrapper, still called `Sorter`, which is `private[spark]`.
The benchmark code is updated to reset the array before each run. Here is the result of sorting primitive Int arrays of size 25 million using Sorter:
~~~
[info] - Sorter benchmark for key-value pairs !!! IGNORED !!!
Java Arrays.sort() on non-primitive int array: Took 13237 ms
Java Arrays.sort() on non-primitive int array: Took 13320 ms
Java Arrays.sort() on non-primitive int array: Took 15718 ms
Java Arrays.sort() on non-primitive int array: Took 13283 ms
Java Arrays.sort() on non-primitive int array: Took 13267 ms
Java Arrays.sort() on non-primitive int array: Took 15122 ms
Java Arrays.sort() on non-primitive int array: Took 15495 ms
Java Arrays.sort() on non-primitive int array: Took 14877 ms
Java Arrays.sort() on non-primitive int array: Took 16429 ms
Java Arrays.sort() on non-primitive int array: Took 14250 ms
Java Arrays.sort() on non-primitive int array: (13878 ms first try, 14499 ms average)
Java Arrays.sort() on primitive int array: Took 2683 ms
Java Arrays.sort() on primitive int array: Took 2683 ms
Java Arrays.sort() on primitive int array: Took 2701 ms
Java Arrays.sort() on primitive int array: Took 2746 ms
Java Arrays.sort() on primitive int array: Took 2685 ms
Java Arrays.sort() on primitive int array: Took 2735 ms
Java Arrays.sort() on primitive int array: Took 2669 ms
Java Arrays.sort() on primitive int array: Took 2693 ms
Java Arrays.sort() on primitive int array: Took 2680 ms
Java Arrays.sort() on primitive int array: Took 2642 ms
Java Arrays.sort() on primitive int array: (2948 ms first try, 2691 ms average)
Sorter without key reuse on primitive int array: Took 10732 ms
Sorter without key reuse on primitive int array: Took 12482 ms
Sorter without key reuse on primitive int array: Took 10718 ms
Sorter without key reuse on primitive int array: Took 12650 ms
Sorter without key reuse on primitive int array: Took 10747 ms
Sorter without key reuse on primitive int array: Took 10783 ms
Sorter without key reuse on primitive int array: Took 12721 ms
Sorter without key reuse on primitive int array: Took 10604 ms
Sorter without key reuse on primitive int array: Took 10622 ms
Sorter without key reuse on primitive int array: Took 11843 ms
Sorter without key reuse on primitive int array: (11089 ms first try, 11390 ms average)
Sorter with key reuse on primitive int array: Took 5141 ms
Sorter with key reuse on primitive int array: Took 5298 ms
Sorter with key reuse on primitive int array: Took 5066 ms
Sorter with key reuse on primitive int array: Took 5164 ms
Sorter with key reuse on primitive int array: Took 5203 ms
Sorter with key reuse on primitive int array: Took 5274 ms
Sorter with key reuse on primitive int array: Took 5186 ms
Sorter with key reuse on primitive int array: Took 5159 ms
Sorter with key reuse on primitive int array: Took 5164 ms
Sorter with key reuse on primitive int array: Took 5078 ms
Sorter with key reuse on primitive int array: (5311 ms first try, 5173 ms average)
~~~
So with key reuse, it is faster and less likely to trigger GC.
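The key-reuse trick is easy to show in miniature. Below is a self-contained sketch of the idea, not Spark's actual SortDataFormat (which is private[spark]): the sort asks the format to fill a caller-owned mutable key holder instead of allocating a boxed key for every comparison.
~~~
// Mutable holder: allocated once per sorter, filled on every lookup.
final class IntKey(var value: Int)

trait KeyReusingFormat[K, Buffer] {
  def newKey(): K
  def getKey(data: Buffer, pos: Int, reuse: K): K
}

object IntArrayFormat extends KeyReusingFormat[IntKey, Array[Int]] {
  def newKey(): IntKey = new IntKey(0)
  def getKey(data: Array[Int], pos: Int, reuse: IntKey): IntKey = {
    reuse.value = data(pos) // no allocation on the hot path
    reuse
  }
}

// Usage: two holders serve every comparison, so n log n comparisons cost
// zero key allocations instead of n log n of them.
def isSorted(data: Array[Int]): Boolean = {
  val a = IntArrayFormat.newKey()
  val b = IntArrayFormat.newKey()
  (1 until data.length).forall { i =>
    IntArrayFormat.getKey(data, i - 1, a).value <=
      IntArrayFormat.getKey(data, i, b).value
  }
}
~~~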
Author: Xiangrui Meng <meng@databricks.com>
Author: Reynold Xin <rxin@apache.org>
Closes #2937 from mengxr/SPARK-4084 and squashes the following commits:
d73c3d0 [Xiangrui Meng] address comments
0b7b682 [Xiangrui Meng] fix mima
a72f53c [Xiangrui Meng] update timeIt
38ba50c [Xiangrui Meng] update timeIt
720f731 [Xiangrui Meng] add doc about JIT specialization
78f2879 [Xiangrui Meng] update tests
7de2efd [Xiangrui Meng] update the Sorter benchmark code to be correct
8626356 [Xiangrui Meng] add prepare to timeIt and update tests in SorterSuite
5f0d530 [Xiangrui Meng] update method modifiers of SortDataFormat
6ffbe66 [Xiangrui Meng] rename Sorter to TimSort and add a Scala wrapper that is private[spark]
b00db4d [Xiangrui Meng] doc and tests
cf94e8a [Xiangrui Meng] renaming
464ddce [Reynold Xin] cherry-pick rxin's commit
        // TaskContext was promoted to an abstract class
        ProblemFilters.exclude[AbstractClassProblem](
          "org.apache.spark.TaskContext"),
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.util.collection.SortDataFormat")
      ) ++ Seq(
        // Adding new methods to the JavaRDDLike trait:
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.takeAsync"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.foreachPartitionAsync"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.countAsync"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.foreachAsync"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.collectAsync")
      ) ++ Seq(
        // SPARK-3822
        ProblemFilters.exclude[IncompatibleResultTypeProblem](
          "org.apache.spark.SparkContext.org$apache$spark$SparkContext$$createTaskScheduler")
      ) ++ Seq(
        // SPARK-1209
        ProblemFilters.exclude[MissingClassProblem](
          "org.apache.hadoop.mapreduce.SparkHadoopMapReduceUtil"),
        ProblemFilters.exclude[MissingClassProblem](
          "org.apache.hadoop.mapred.SparkHadoopMapRedUtil"),
        ProblemFilters.exclude[MissingTypesProblem](
          "org.apache.spark.rdd.PairRDDFunctions")
      ) ++ Seq(
        // SPARK-4062
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.streaming.kafka.KafkaReceiver#MessageHandler.this")
      )
case v if v.startsWith("1.1") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
|
|
|
MimaBuild.excludeSparkPackage("graphx")
|
|
|
|
) ++
|
|
|
|
Seq(
|
|
|
|
// Adding new method to JavaRDLike trait - we should probably mark this as a developer API.
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.api.java.JavaRDDLike.partitions"),
|
2014-09-02 02:28:19 -04:00
|
|
|
// Should probably mark this as Experimental
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.foreachAsync"),
|
2014-07-23 20:12:28 -04:00
|
|
|
// We made a mistake earlier (ed06500d3) in the Java API to use default parameter values
|
|
|
|
// for countApproxDistinct* functions, which does not work in Java. We later removed
|
|
|
|
// them, and use the following to tell Mima to not care about them.
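        // (Scala encodes each default argument as a synthetic method named
        // `methodName$default$N`; those compiler-generated methods are what the
        // excludes below refer to, and they are not visible to Java callers.)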
        ProblemFilters.exclude[IncompatibleResultTypeProblem](
          "org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey"),
        ProblemFilters.exclude[IncompatibleResultTypeProblem](
          "org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaPairRDD.countApproxDistinct$default$1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey$default$1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDD.countApproxDistinct$default$1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaRDDLike.countApproxDistinct$default$1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.api.java.JavaDoubleRDD.countApproxDistinct$default$1"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.storage.DiskStore.getValues"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.storage.MemoryStore.Entry")
      ) ++
      Seq(
        // Serializer interface change. See SPARK-3045.
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.serializer.DeserializationStream"),
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.serializer.Serializer"),
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.serializer.SerializationStream"),
        ProblemFilters.exclude[IncompatibleTemplateDefProblem](
          "org.apache.spark.serializer.SerializerInstance")
      ) ++
      Seq(
        // Renamed putValues -> putArray + putIterator
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.storage.MemoryStore.putValues"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.storage.DiskStore.putValues"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.storage.TachyonStore.putValues")
      ) ++
      Seq(
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.streaming.flume.FlumeReceiver.this"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.streaming.kafka.KafkaUtils.createStream"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.streaming.kafka.KafkaReceiver.this")
      ) ++
      Seq( // Ignore some private methods in ALS.
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$updateFeatures"),
        ProblemFilters.exclude[MissingMethodProblem]( // The only public constructor is the one without arguments.
          "org.apache.spark.mllib.recommendation.ALS.this"),
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$<init>$default$7"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$updateFeatures")
      ) ++
      MimaBuild.excludeSparkClass("mllib.linalg.distributed.ColumnStatisticsAggregator") ++
      MimaBuild.excludeSparkClass("rdd.ZippedRDD") ++
      MimaBuild.excludeSparkClass("rdd.ZippedPartition") ++
      MimaBuild.excludeSparkClass("util.SerializableHyperLogLog") ++
      MimaBuild.excludeSparkClass("storage.Values") ++
      MimaBuild.excludeSparkClass("storage.Entry") ++
      MimaBuild.excludeSparkClass("storage.MemoryStore$Entry") ++
      // Class was missing "@DeveloperApi" annotation in 1.0.
      MimaBuild.excludeSparkClass("scheduler.SparkListenerApplicationStart") ++
      Seq(
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.mllib.tree.impurity.Gini.calculate"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.mllib.tree.impurity.Entropy.calculate"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem](
          "org.apache.spark.mllib.tree.impurity.Variance.calculate")
      ) ++
      Seq( // Package-private classes removed in SPARK-2341
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser$"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser$"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser"),
        ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser$")
      ) ++
      Seq( // package-private classes removed in MLlib
        ProblemFilters.exclude[MissingMethodProblem](
          "org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.org$apache$spark$mllib$regression$GeneralizedLinearAlgorithm$$prependOne")
      ) ++
      Seq( // new Vector methods in MLlib (binary compatible assuming users do not implement Vector)
        ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.copy")
      ) ++
      Seq( // synthetic methods generated in LabeledPoint
        ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.mllib.regression.LabeledPoint$"),
        ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.mllib.regression.LabeledPoint.apply"),
        ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.regression.LabeledPoint.toString")
      ) ++
      Seq( // Scala 2.11 compatibility fix
        ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.StreamingContext.<init>$default$2")
      )

    case v if v.startsWith("1.0") =>
      Seq(
        MimaBuild.excludeSparkPackage("api.java"),
        MimaBuild.excludeSparkPackage("mllib"),
        MimaBuild.excludeSparkPackage("streaming")
      ) ++
      MimaBuild.excludeSparkClass("rdd.ClassTags") ++
      MimaBuild.excludeSparkClass("util.XORShiftRandom") ++
      MimaBuild.excludeSparkClass("graphx.EdgeRDD") ++
      MimaBuild.excludeSparkClass("graphx.VertexRDD") ++
      MimaBuild.excludeSparkClass("graphx.impl.GraphImpl") ++
      MimaBuild.excludeSparkClass("graphx.impl.RoutingTable") ++
      MimaBuild.excludeSparkClass("graphx.util.collection.PrimitiveKeyOpenHashMap") ++
      MimaBuild.excludeSparkClass("graphx.util.collection.GraphXPrimitiveKeyOpenHashMap") ++
      MimaBuild.excludeSparkClass("mllib.recommendation.MFDataGenerator") ++
      MimaBuild.excludeSparkClass("mllib.optimization.SquaredGradient") ++
      MimaBuild.excludeSparkClass("mllib.regression.RidgeRegressionWithSGD") ++
      MimaBuild.excludeSparkClass("mllib.regression.LassoWithSGD") ++
      MimaBuild.excludeSparkClass("mllib.regression.LinearRegressionWithSGD")

    case _ => Seq()
  }
}