2014-06-01 20:27:05 -04:00
|
|
|
/*
|
|
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
|
|
* this work for additional information regarding copyright ownership.
|
|
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
* (the "License"); you may not use this file except in compliance with
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
import com.typesafe.tools.mima.core._
|
2015-03-20 14:43:57 -04:00
|
|
|
import com.typesafe.tools.mima.core.ProblemFilters._
|
2014-06-01 20:27:05 -04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Additional excludes for checking of Spark's binary compatibility.
|
|
|
|
*
|
|
|
|
* The Mima build will automatically exclude @DeveloperApi and @Experimental classes. This acts
|
|
|
|
* as an official audit of cases where we excluded other classes. Please use the narrowest
|
|
|
|
* possible exclude here. MIMA will usually tell you what exclude to use, e.g.:
|
|
|
|
*
|
|
|
|
* ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.rdd.RDD.take")
|
|
|
|
*
|
|
|
|
* It is also possible to exclude Spark classes and packages. This should be used sparingly:
|
|
|
|
*
|
|
|
|
* MimaBuild.excludeSparkClass("graphx.util.collection.GraphXPrimitiveKeyOpenHashMap")
|
|
|
|
*/
|
|
|
|
object MimaExcludes {
|
2014-07-23 20:12:28 -04:00
|
|
|
def excludes(version: String) =
|
|
|
|
version match {
|
2015-03-20 14:43:57 -04:00
|
|
|
case v if v.startsWith("1.4") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
|
|
|
MimaBuild.excludeSparkPackage("ml"),
|
|
|
|
// SPARK-5922 Adding a generalized diff(other: RDD[(VertexId, VD)]) to VertexRDD
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.diff"),
|
|
|
|
// These are needed if checking against the sbt build, since they are part of
|
|
|
|
// the maven-generated artifacts in 1.3.
|
|
|
|
excludePackage("org.spark-project.jetty"),
|
|
|
|
MimaBuild.excludeSparkPackage("unused"),
|
2015-03-24 02:41:06 -04:00
|
|
|
ProblemFilters.exclude[MissingClassProblem]("com.google.common.base.Optional"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.rdd.JdbcRDD.compute"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.broadcast.HttpBroadcastFactory.newBroadcast"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
2015-03-30 00:25:09 -04:00
|
|
|
"org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorActor")
|
2015-03-26 22:08:09 -04:00
|
|
|
) ++ Seq(
|
2015-04-01 06:09:00 -04:00
|
|
|
// SPARK-4655 - Making Stage an Abstract class broke binary compatibility even though
|
|
|
|
// the stage class is defined as private[spark]
|
|
|
|
ProblemFilters.exclude[AbstractClassProblem]("org.apache.spark.scheduler.Stage")
|
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-6510 Add a Graph#minus method acting as Set#difference
|
2015-03-26 22:08:09 -04:00
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.minus")
|
2015-04-03 14:23:11 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-6492 Fix deadlock in SparkContext.stop()
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.SparkContext.org$" +
|
|
|
|
"apache$spark$SparkContext$$SPARK_CONTEXT_CONSTRUCTOR_LOCK")
|
2015-04-09 18:37:45 -04:00
|
|
|
)++ Seq(
|
|
|
|
// SPARK-6693 add tostring with max lines and width for matrix
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.toString")
|
2015-04-17 21:28:42 -04:00
|
|
|
)++ Seq(
|
|
|
|
// SPARK-6703 Add getOrCreate method to SparkContext
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem]
|
|
|
|
("org.apache.spark.SparkContext.org$apache$spark$SparkContext$$activeContext")
|
2015-04-27 22:02:51 -04:00
|
|
|
)++ Seq(
|
|
|
|
// SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.mllib.clustering.LDA$EMOptimizer")
|
[SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
Add `compressed` to `Vector` with some other methods: `numActives`, `numNonzeros`, `toSparse`, and `toDense`. jkbradley
Author: Xiangrui Meng <meng@databricks.com>
Closes #5756 from mengxr/SPARK-6756 and squashes the following commits:
8d4ecbd [Xiangrui Meng] address comment and add mima excludes
da54179 [Xiangrui Meng] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
2015-04-29 00:49:53 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Vector.compressed"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Vector.toDense"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Vector.numNonzeros"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Vector.toSparse"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Vector.numActives")
|
2015-04-30 19:23:01 -04:00
|
|
|
) ++ Seq(
|
[SPARK-6908] [SQL] Use isolated Hive client
This PR switches Spark SQL's Hive support to use the isolated hive client interface introduced by #5851, instead of directly interacting with the client. By using this isolated client we can now allow users to dynamically configure the version of Hive that they are connecting to by setting `spark.sql.hive.metastore.version` without the need to recompile. This also greatly reduces the surface area for our interaction with the hive libraries, hopefully making it easier to support other versions in the future.
Jars for the desired hive version can be configured using `spark.sql.hive.metastore.jars`, which accepts the following options:
- a colon-separated list of jar files or directories for hive and hadoop.
- `builtin` - attempt to discover the jars that were used to load Spark SQL and use those. This
option is only valid when using the execution version of Hive.
- `maven` - download the correct version of hive on demand from maven.
By default, `builtin` is used for Hive 13.
This PR also removes the test step for building against Hive 12, as this will no longer be required to talk to Hive 12 metastores. However, the full removal of the Shim is deferred until a later PR.
Remaining TODOs:
- Remove the Hive Shims and inline code for Hive 13.
- Several HiveCompatibility tests are not yet passing.
- `nullformatCTAS` - As detailed below, we now are handling CTAS parsing ourselves instead of hacking into the Hive semantic analyzer. However, we currently only handle the common cases and not things like CTAS where the null format is specified.
- `combine1` now leaks state about compression somehow, breaking all subsequent tests. As such we currently add it to the blacklist
- `part_inherit_tbl_props` and `part_inherit_tbl_props_with_star` do not work anymore. We are correctly propagating the information
- "load_dyn_part14.*" - These tests pass when run on their own, but fail when run with all other tests. It seems our `RESET` mechanism may not be as robust as it used to be?
Other required changes:
- `CreateTableAsSelect` no longer carries parts of the HiveQL AST with it through the query execution pipeline. Instead, we parse CTAS during the HiveQL conversion and construct a `HiveTable`. The full parsing here is not yet complete as detailed above in the remaining TODOs. Since the operator is Hive specific, it is moved to the hive package.
- `Command` is simplified to be a trait that simply acts as a marker for a LogicalPlan that should be eagerly evaluated.
Author: Michael Armbrust <michael@databricks.com>
Closes #5876 from marmbrus/useIsolatedClient and squashes the following commits:
258d000 [Michael Armbrust] really really correct path handling
e56fd4a [Michael Armbrust] getAbsolutePath
5a259f5 [Michael Armbrust] fix typos
81bb366 [Michael Armbrust] comments from vanzin
5f3945e [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient
4b5cd41 [Michael Armbrust] yin's comments
f5de7de [Michael Armbrust] cleanup
11e9c72 [Michael Armbrust] better coverage in versions suite
7e8f010 [Michael Armbrust] better error messages and jar handling
e7b3941 [Michael Armbrust] more permissive checking for function registration
da91ba7 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient
5fe5894 [Michael Armbrust] fix serialization suite
81711c4 [Michael Armbrust] Initial support for running without maven
1d8ae44 [Michael Armbrust] fix final tests?
1c50813 [Michael Armbrust] more comments
a3bee70 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient
a6f5df1 [Michael Armbrust] style
ab07f7e [Michael Armbrust] WIP
4d8bf02 [Michael Armbrust] Remove hive 12 compilation
8843a25 [Michael Armbrust] [SPARK-6908] [SQL] Use isolated Hive client
(cherry picked from commit cd1d4110cfffb413ab585cf1cc8f1264243cb393)
Signed-off-by: Yin Huai <yhuai@databricks.com>
2015-05-07 22:36:24 -04:00
|
|
|
// Execution should never be included as it's always internal.
|
|
|
|
MimaBuild.excludeSparkPackage("sql.execution"),
|
2015-04-30 19:23:01 -04:00
|
|
|
// This `protected[sql]` method was removed in 1.3.1
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.sql.SQLContext.checkAnalysis"),
|
|
|
|
// This `private[sql]` class was removed in 1.4.0:
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.sql.execution.AddExchange"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.sql.execution.AddExchange$"),
|
|
|
|
// These test support classes were moved out of src/main and into src/test:
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.sql.parquet.ParquetTestData"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.sql.parquet.ParquetTestData$"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.sql.parquet.TestGroupWriteSupport")
|
2015-03-20 14:43:57 -04:00
|
|
|
)
|
|
|
|
|
2014-11-19 00:24:18 -05:00
|
|
|
case v if v.startsWith("1.3") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
[SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs
This is part (1a) of the updates from the design doc in [https://docs.google.com/document/d/1BH9el33kBX8JiDdgUJXdLW14CA2qhTCWIG46eXZVoJs]
**UPDATE**: Most of the APIs are being kept private[spark] to allow further discussion. Here is a list of changes which are public:
* new output columns: rawPrediction, probabilities
* The “score” column is now called “rawPrediction”
* Classifiers now provide numClasses
* Params.get and .set are now protected instead of private[ml].
* ParamMap now has a size method.
* new classes: LinearRegression, LinearRegressionModel
* LogisticRegression now has an intercept.
### Sketch of APIs (most of which are private[spark] for now)
Abstract classes for learning algorithms (+ corresponding Model abstractions):
* Classifier (+ ClassificationModel)
* ProbabilisticClassifier (+ ProbabilisticClassificationModel)
* Regressor (+ RegressionModel)
* Predictor (+ PredictionModel)
* *For all of these*:
* There is no strongly typed training-time API.
* There is a strongly typed test-time (prediction) API which helps developers implement new algorithms.
Concrete classes: learning algorithms
* LinearRegression
* LogisticRegression (updated to use new abstract classes)
* Also, removed "score" in favor of "probability" output column. Changed BinaryClassificationEvaluator to match. (SPARK-5031)
Other updates:
* params.scala: Changed Params.set/get to be protected instead of private[ml]
* This was needed for the example of defining a class from outside of the MLlib namespace.
* VectorUDT: Will later change from private[spark] to public.
* This is needed for outside users to write their own validateAndTransformSchema() methods using vectors.
* Also, added equals() method.
* SPARK-4942 : ML Transformers should allow output cols to be turned on,off
* Update validateAndTransformSchema
* Update transform
* (Updated examples, test suites according to other changes)
New examples:
* DeveloperApiExample.scala (example of defining algorithm from outside of the MLlib namespace)
* Added Java version too
Test Suites:
* LinearRegressionSuite
* LogisticRegressionSuite
* + Java versions of above suites
CC: mengxr etrain shivaram
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #3637 from jkbradley/ml-api-part1 and squashes the following commits:
405bfb8 [Joseph K. Bradley] Last edits based on code review. Small cleanups
fec348a [Joseph K. Bradley] Added JavaDeveloperApiExample.java and fixed other issues: Made developer API private[spark] for now. Added constructors Java can understand to specialized Param types.
8316d5e [Joseph K. Bradley] fixes after rebasing on master
fc62406 [Joseph K. Bradley] fixed test suites after last commit
bcb9549 [Joseph K. Bradley] Fixed issues after rebasing from master (after move from SchemaRDD to DataFrame)
9872424 [Joseph K. Bradley] fixed JavaLinearRegressionSuite.java Java sql api
f542997 [Joseph K. Bradley] Added MIMA excludes for VectorUDT (now public), and added DeveloperApi annotation to it
216d199 [Joseph K. Bradley] fixed after sql datatypes PR got merged
f549e34 [Joseph K. Bradley] Updates based on code review. Major ones are: * Created weakly typed Predictor.train() method which is called by fit() so that developers do not have to call schema validation or copy parameters. * Made Predictor.featuresDataType have a default value of VectorUDT. * NOTE: This could be dangerous since the FeaturesType type parameter cannot have a default value.
343e7bd [Joseph K. Bradley] added blanket mima exclude for ml package
82f340b [Joseph K. Bradley] Fixed bug in LogisticRegression (introduced in this PR). Fixed Java suites
0a16da9 [Joseph K. Bradley] Fixed Linear/Logistic RegressionSuites
c3c8da5 [Joseph K. Bradley] small cleanup
934f97b [Joseph K. Bradley] Fixed bugs from previous commit.
1c61723 [Joseph K. Bradley] * Made ProbabilisticClassificationModel into a subclass of ClassificationModel. Also introduced ProbabilisticClassifier. * This was to support output column “probabilityCol” in transform().
4e2f711 [Joseph K. Bradley] rat fix
bc654e1 [Joseph K. Bradley] Added spark.ml LinearRegressionSuite
8d13233 [Joseph K. Bradley] Added methods: * Classifier: batch predictRaw() * Predictor: train() without paramMap ProbabilisticClassificationModel.predictProbabilities() * Java versions of all above batch methods + others
1680905 [Joseph K. Bradley] Added JavaLabeledPointSuite.java for spark.ml, and added constructor to LabeledPoint which defaults weight to 1.0
adbe50a [Joseph K. Bradley] * fixed LinearRegression train() to use embedded paramMap * added Predictor.predict(RDD[Vector]) method * updated Linear/LogisticRegressionSuites
58802e3 [Joseph K. Bradley] added train() to Predictor subclasses which does not take a ParamMap.
57d54ab [Joseph K. Bradley] * Changed semantics of Predictor.train() to merge the given paramMap with the embedded paramMap. * remove threshold_internal from logreg * Added Predictor.copy() * Extended LogisticRegressionSuite
e433872 [Joseph K. Bradley] Updated docs. Added LabeledPointSuite to spark.ml
54b7b31 [Joseph K. Bradley] Fixed issue with logreg threshold being set correctly
0617d61 [Joseph K. Bradley] Fixed bug from last commit (sorting paramMap by parameter names in toString). Fixed bug in persisting logreg data. Added threshold_internal to logreg for faster test-time prediction (avoiding map lookup).
601e792 [Joseph K. Bradley] Modified ParamMap to sort parameters in toString. Cleaned up classes in class hierarchy, before implementing tests and examples.
d705e87 [Joseph K. Bradley] Added LinearRegression and Regressor back from ml-api branch
52f4fde [Joseph K. Bradley] removing everything except for simple class hierarchy for classification
d35bb5d [Joseph K. Bradley] fixed compilation issues, but have not added tests yet
bfade12 [Joseph K. Bradley] Added lots of classes for new ML API:
2015-02-06 02:43:47 -05:00
|
|
|
MimaBuild.excludeSparkPackage("ml"),
|
2014-11-19 00:24:18 -05:00
|
|
|
// These are needed if checking against the sbt build, since they are part of
|
|
|
|
// the maven-generated artifacts in the 1.2 build.
|
|
|
|
MimaBuild.excludeSparkPackage("unused"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("com.google.common.base.Optional")
|
2014-11-19 17:03:44 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-2321
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.SparkStageInfoImpl.this"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.SparkStageInfo.submissionTime")
|
2014-11-26 11:22:50 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4614
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrices.randn"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrices.rand")
|
2015-01-27 04:46:17 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5321
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.SparseMatrix.transposeMultiply"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.transpose"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.DenseMatrix.transposeMultiply"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Matrix." +
|
|
|
|
"org$apache$spark$mllib$linalg$Matrix$_setter_$isTransposed_="),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.isTransposed"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.linalg.Matrix.foreachActive")
|
2015-02-02 20:10:01 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5540
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
2015-02-03 02:49:09 -05:00
|
|
|
"org.apache.spark.mllib.recommendation.ALS.solveLeastSquares"),
|
|
|
|
// SPARK-5536
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateFeatures"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateBlock")
|
2014-12-31 19:59:17 -05:00
|
|
|
) ++ Seq(
|
2015-01-02 18:09:41 -05:00
|
|
|
// SPARK-3325
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaDStreamLike.print"),
|
2014-12-31 19:59:17 -05:00
|
|
|
// SPARK-2757
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.streaming.flume.sink.SparkAvroCallbackHandler." +
|
|
|
|
"removeAndGetProcessor")
|
2015-01-13 20:16:41 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5123 (SparkSQL data type change) - alpha component only
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.HashingTF.outputDataType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.Tokenizer.outputDataType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.feature.Tokenizer.validateInputType"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.classification.LogisticRegressionModel.validateAndTransformSchema"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.ml.classification.LogisticRegression.validateAndTransformSchema")
|
[SPARK-4014] Add TaskContext.attemptNumber and deprecate TaskContext.attemptId
`TaskContext.attemptId` is misleadingly-named, since it currently returns a taskId, which uniquely identifies a particular task attempt within a particular SparkContext, instead of an attempt number, which conveys how many times a task has been attempted.
This patch deprecates `TaskContext.attemptId` and adds `TaskContext.taskId` and `TaskContext.attemptNumber` fields. Prior to this change, it was impossible to determine whether a task was being re-attempted (or was a speculative copy), which made it difficult to write unit tests for tasks that fail on early attempts or speculative tasks that complete faster than original tasks.
Earlier versions of the TaskContext docs suggest that `attemptId` behaves like `attemptNumber`, so there's an argument to be made in favor of changing this method's implementation. Since we've decided against making that change in maintenance branches, I think it's simpler to add better-named methods and retain the old behavior for `attemptId`; if `attemptId` behaved differently in different branches, then this would cause confusing build-breaks when backporting regression tests that rely on the new `attemptId` behavior.
Most of this patch is fairly straightforward, but there is a bit of trickiness related to Mesos tasks: since there's no field in MesosTaskInfo to encode the attemptId, I packed it into the `data` field alongside the task binary.
Author: Josh Rosen <joshrosen@databricks.com>
Closes #3849 from JoshRosen/SPARK-4014 and squashes the following commits:
89d03e0 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
5cfff05 [Josh Rosen] Introduce wrapper for serializing Mesos task launch data.
38574d4 [Josh Rosen] attemptId -> taskAttemptId in PairRDDFunctions
a180b88 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
1d43aa6 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
eee6a45 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4014
0b10526 [Josh Rosen] Use putInt instead of putLong (silly mistake)
8c387ce [Josh Rosen] Use local with maxRetries instead of local-cluster.
cbe4d76 [Josh Rosen] Preserve attemptId behavior and deprecate it:
b2dffa3 [Josh Rosen] Address some of Reynold's minor comments
9d8d4d1 [Josh Rosen] Doc typo
1e7a933 [Josh Rosen] [SPARK-4014] Change TaskContext.attemptId to return attempt number instead of task ID.
fd515a5 [Josh Rosen] Add failing test for SPARK-4014
2015-01-14 14:45:40 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4014
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.TaskContext.taskAttemptId"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.TaskContext.attemptNumber")
|
2015-01-17 00:09:06 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5166 Spark SQL API stabilization
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Transformer.transform"),
|
2015-01-27 19:08:24 -05:00
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Transformer.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Pipeline.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.PipelineModel.transform"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Estimator.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Evaluator.evaluate"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Evaluator.evaluate"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidator.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidatorModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScaler.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScalerModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.transform"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegression.fit"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate")
|
2015-01-20 01:50:44 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5270
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.isEmpty")
|
2015-01-28 20:26:03 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5430
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.treeReduce"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.treeAggregate")
|
2015-01-21 02:37:47 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5297 Java FileStream do not work with custom key/values
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaStreamingContext.fileStream")
|
2015-01-23 01:04:21 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5315 Spark Streaming Java API returns Scala DStream
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.api.java.JavaDStreamLike.reduceByWindow")
|
2015-02-02 17:34:48 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5461 Graph should have isCheckpointed, getCheckpointFiles methods
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.graphx.Graph.getCheckpointFiles"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.graphx.Graph.isCheckpointed")
|
[SPARK-4789] [SPARK-4942] [SPARK-5031] [mllib] Standardize ML Prediction APIs
This is part (1a) of the updates from the design doc in [https://docs.google.com/document/d/1BH9el33kBX8JiDdgUJXdLW14CA2qhTCWIG46eXZVoJs]
**UPDATE**: Most of the APIs are being kept private[spark] to allow further discussion. Here is a list of changes which are public:
* new output columns: rawPrediction, probabilities
* The “score” column is now called “rawPrediction”
* Classifiers now provide numClasses
* Params.get and .set are now protected instead of private[ml].
* ParamMap now has a size method.
* new classes: LinearRegression, LinearRegressionModel
* LogisticRegression now has an intercept.
### Sketch of APIs (most of which are private[spark] for now)
Abstract classes for learning algorithms (+ corresponding Model abstractions):
* Classifier (+ ClassificationModel)
* ProbabilisticClassifier (+ ProbabilisticClassificationModel)
* Regressor (+ RegressionModel)
* Predictor (+ PredictionModel)
* *For all of these*:
* There is no strongly typed training-time API.
* There is a strongly typed test-time (prediction) API which helps developers implement new algorithms.
Concrete classes: learning algorithms
* LinearRegression
* LogisticRegression (updated to use new abstract classes)
* Also, removed "score" in favor of "probability" output column. Changed BinaryClassificationEvaluator to match. (SPARK-5031)
Other updates:
* params.scala: Changed Params.set/get to be protected instead of private[ml]
* This was needed for the example of defining a class from outside of the MLlib namespace.
* VectorUDT: Will later change from private[spark] to public.
* This is needed for outside users to write their own validateAndTransformSchema() methods using vectors.
* Also, added equals() method.
* SPARK-4942 : ML Transformers should allow output cols to be turned on,off
* Update validateAndTransformSchema
* Update transform
* (Updated examples, test suites according to other changes)
New examples:
* DeveloperApiExample.scala (example of defining algorithm from outside of the MLlib namespace)
* Added Java version too
Test Suites:
* LinearRegressionSuite
* LogisticRegressionSuite
* + Java versions of above suites
CC: mengxr etrain shivaram
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #3637 from jkbradley/ml-api-part1 and squashes the following commits:
405bfb8 [Joseph K. Bradley] Last edits based on code review. Small cleanups
fec348a [Joseph K. Bradley] Added JavaDeveloperApiExample.java and fixed other issues: Made developer API private[spark] for now. Added constructors Java can understand to specialized Param types.
8316d5e [Joseph K. Bradley] fixes after rebasing on master
fc62406 [Joseph K. Bradley] fixed test suites after last commit
bcb9549 [Joseph K. Bradley] Fixed issues after rebasing from master (after move from SchemaRDD to DataFrame)
9872424 [Joseph K. Bradley] fixed JavaLinearRegressionSuite.java Java sql api
f542997 [Joseph K. Bradley] Added MIMA excludes for VectorUDT (now public), and added DeveloperApi annotation to it
216d199 [Joseph K. Bradley] fixed after sql datatypes PR got merged
f549e34 [Joseph K. Bradley] Updates based on code review. Major ones are: * Created weakly typed Predictor.train() method which is called by fit() so that developers do not have to call schema validation or copy parameters. * Made Predictor.featuresDataType have a default value of VectorUDT. * NOTE: This could be dangerous since the FeaturesType type parameter cannot have a default value.
343e7bd [Joseph K. Bradley] added blanket mima exclude for ml package
82f340b [Joseph K. Bradley] Fixed bug in LogisticRegression (introduced in this PR). Fixed Java suites
0a16da9 [Joseph K. Bradley] Fixed Linear/Logistic RegressionSuites
c3c8da5 [Joseph K. Bradley] small cleanup
934f97b [Joseph K. Bradley] Fixed bugs from previous commit.
1c61723 [Joseph K. Bradley] * Made ProbabilisticClassificationModel into a subclass of ClassificationModel. Also introduced ProbabilisticClassifier. * This was to support output column “probabilityCol” in transform().
4e2f711 [Joseph K. Bradley] rat fix
bc654e1 [Joseph K. Bradley] Added spark.ml LinearRegressionSuite
8d13233 [Joseph K. Bradley] Added methods: * Classifier: batch predictRaw() * Predictor: train() without paramMap ProbabilisticClassificationModel.predictProbabilities() * Java versions of all above batch methods + others
1680905 [Joseph K. Bradley] Added JavaLabeledPointSuite.java for spark.ml, and added constructor to LabeledPoint which defaults weight to 1.0
adbe50a [Joseph K. Bradley] * fixed LinearRegression train() to use embedded paramMap * added Predictor.predict(RDD[Vector]) method * updated Linear/LogisticRegressionSuites
58802e3 [Joseph K. Bradley] added train() to Predictor subclasses which does not take a ParamMap.
57d54ab [Joseph K. Bradley] * Changed semantics of Predictor.train() to merge the given paramMap with the embedded paramMap. * remove threshold_internal from logreg * Added Predictor.copy() * Extended LogisticRegressionSuite
e433872 [Joseph K. Bradley] Updated docs. Added LabeledPointSuite to spark.ml
54b7b31 [Joseph K. Bradley] Fixed issue with logreg threshold being set correctly
0617d61 [Joseph K. Bradley] Fixed bug from last commit (sorting paramMap by parameter names in toString). Fixed bug in persisting logreg data. Added threshold_internal to logreg for faster test-time prediction (avoiding map lookup).
601e792 [Joseph K. Bradley] Modified ParamMap to sort parameters in toString. Cleaned up classes in class hierarchy, before implementing tests and examples.
d705e87 [Joseph K. Bradley] Added LinearRegression and Regressor back from ml-api branch
52f4fde [Joseph K. Bradley] removing everything except for simple class hierarchy for classification
d35bb5d [Joseph K. Bradley] fixed compilation issues, but have not added tests yet
bfade12 [Joseph K. Bradley] Added lots of classes for new ML API:
2015-02-06 02:43:47 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4789 Standardize ML Prediction APIs
|
|
|
|
ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.mllib.linalg.VectorUDT"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.serialize"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.sqlType")
|
2015-03-12 04:39:04 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5814
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$wrapDoubleArray"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$fillFullMatrix"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$iterations"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeOutLinkBlock"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$computeYtY"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeLinkRDDs"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$alpha"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$randomFactor"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$makeInLinkBlock"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$dspr"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$lambda"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$implicitPrefs"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$rank")
|
2015-02-19 18:35:23 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4682
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.RealClock"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.Clock"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.TestClock")
|
2015-03-16 04:06:26 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-5922 Adding a generalized diff(other: RDD[(VertexId, VD)]) to VertexRDD
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.VertexRDD.diff")
|
2014-11-19 00:24:18 -05:00
|
|
|
)
|
|
|
|
|
2014-09-07 23:39:53 -04:00
|
|
|
case v if v.startsWith("1.2") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
|
|
|
MimaBuild.excludeSparkPackage("graphx")
|
2014-09-19 01:18:51 -04:00
|
|
|
) ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.linalg.Matrix") ++
|
[MLlib] [SPARK-2885] DIMSUM: All-pairs similarity
# All-pairs similarity via DIMSUM
Compute all pairs of similar vectors using brute force approach, and also DIMSUM sampling approach.
Laying down some notation: we are looking for all pairs of similar columns in an m x n RowMatrix whose entries are denoted a_ij, with the i’th row denoted r_i and the j’th column denoted c_j. There is an oversampling parameter labeled ɣ that should be set to 4 log(n)/s to get provably correct results (with high probability), where s is the similarity threshold.
The algorithm is stated with a Map and Reduce, with proofs of correctness and efficiency in published papers [1] [2]. The reducer is simply the summation reducer. The mapper is more interesting, and is also the heart of the scheme. As an exercise, you should try to see why in expectation, the map-reduce below outputs cosine similarities.
![dimsumv2](https://cloud.githubusercontent.com/assets/3220351/3807272/d1d9514e-1c62-11e4-9f12-3cfdb1d78b3a.png)
[1] Bosagh-Zadeh, Reza and Carlsson, Gunnar (2013), Dimension Independent Matrix Square using MapReduce, arXiv:1304.1467 http://arxiv.org/abs/1304.1467
[2] Bosagh-Zadeh, Reza and Goel, Ashish (2012), Dimension Independent Similarity Computation, arXiv:1206.2082 http://arxiv.org/abs/1206.2082
# Testing
Tests for all invocations included.
Added L1 and L2 norm computation to MultivariateStatisticalSummary since it was needed. Added tests for both of them.
Author: Reza Zadeh <rizlar@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Closes #1778 from rezazadeh/dimsumv2 and squashes the following commits:
404c64c [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
4eb71c6 [Reza Zadeh] Add excludes for normL1 and normL2
ee8bd65 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
976ddd4 [Reza Zadeh] Broadcast colMags. Avoid div by zero.
3467cff [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
aea0247 [Reza Zadeh] Allow large thresholds to promote sparsity
9fe17c0 [Xiangrui Meng] organize imports
2196ba5 [Xiangrui Meng] Merge branch 'rezazadeh-dimsumv2' into dimsumv2
254ca08 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
f2947e4 [Xiangrui Meng] some optimization
3c4cf41 [Xiangrui Meng] Merge branch 'master' into rezazadeh-dimsumv2
0e4eda4 [Reza Zadeh] Use partition index for RNG
251bb9c [Reza Zadeh] Documentation
25e9d0d [Reza Zadeh] Line length for style
fb296f6 [Reza Zadeh] renamed to normL1 and normL2
3764983 [Reza Zadeh] Documentation
e9c6791 [Reza Zadeh] New interface and documentation
613f261 [Reza Zadeh] Column magnitude summary
75a0b51 [Reza Zadeh] Use Ints instead of Longs in the shuffle
0f12ade [Reza Zadeh] Style changes
eb1dc20 [Reza Zadeh] Use Double.PositiveInfinity instead of Double.Max
f56a882 [Reza Zadeh] Remove changes to MultivariateOnlineSummarizer
dbc55ba [Reza Zadeh] Make colMagnitudes a method in RowMatrix
41e8ece [Reza Zadeh] style changes
139c8e1 [Reza Zadeh] Syntax changes
029aa9c [Reza Zadeh] javadoc and new test
75edb25 [Reza Zadeh] All tests passing!
05e59b8 [Reza Zadeh] Add test
502ce52 [Reza Zadeh] new interface
654c4fb [Reza Zadeh] default methods
3726ca9 [Reza Zadeh] Remove MatrixAlgebra
6bebabb [Reza Zadeh] remove changes to MatrixSuite
5b8cd7d [Reza Zadeh] Initial files
2014-09-29 14:15:09 -04:00
|
|
|
MimaBuild.excludeSparkClass("mllib.linalg.Vector") ++
|
|
|
|
Seq(
|
2014-10-02 03:29:31 -04:00
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.scheduler.TaskLocation"),
|
[MLlib] [SPARK-2885] DIMSUM: All-pairs similarity
# All-pairs similarity via DIMSUM
Compute all pairs of similar vectors using brute force approach, and also DIMSUM sampling approach.
Laying down some notation: we are looking for all pairs of similar columns in an m x n RowMatrix whose entries are denoted a_ij, with the i’th row denoted r_i and the j’th column denoted c_j. There is an oversampling parameter labeled ɣ that should be set to 4 log(n)/s to get provably correct results (with high probability), where s is the similarity threshold.
The algorithm is stated with a Map and Reduce, with proofs of correctness and efficiency in published papers [1] [2]. The reducer is simply the summation reducer. The mapper is more interesting, and is also the heart of the scheme. As an exercise, you should try to see why in expectation, the map-reduce below outputs cosine similarities.
![dimsumv2](https://cloud.githubusercontent.com/assets/3220351/3807272/d1d9514e-1c62-11e4-9f12-3cfdb1d78b3a.png)
[1] Bosagh-Zadeh, Reza and Carlsson, Gunnar (2013), Dimension Independent Matrix Square using MapReduce, arXiv:1304.1467 http://arxiv.org/abs/1304.1467
[2] Bosagh-Zadeh, Reza and Goel, Ashish (2012), Dimension Independent Similarity Computation, arXiv:1206.2082 http://arxiv.org/abs/1206.2082
# Testing
Tests for all invocations included.
Added L1 and L2 norm computation to MultivariateStatisticalSummary since it was needed. Added tests for both of them.
Author: Reza Zadeh <rizlar@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Closes #1778 from rezazadeh/dimsumv2 and squashes the following commits:
404c64c [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
4eb71c6 [Reza Zadeh] Add excludes for normL1 and normL2
ee8bd65 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
976ddd4 [Reza Zadeh] Broadcast colMags. Avoid div by zero.
3467cff [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
aea0247 [Reza Zadeh] Allow large thresholds to promote sparsity
9fe17c0 [Xiangrui Meng] organize imports
2196ba5 [Xiangrui Meng] Merge branch 'rezazadeh-dimsumv2' into dimsumv2
254ca08 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
f2947e4 [Xiangrui Meng] some optimization
3c4cf41 [Xiangrui Meng] Merge branch 'master' into rezazadeh-dimsumv2
0e4eda4 [Reza Zadeh] Use partition index for RNG
251bb9c [Reza Zadeh] Documentation
25e9d0d [Reza Zadeh] Line length for style
fb296f6 [Reza Zadeh] renamed to normL1 and normL2
3764983 [Reza Zadeh] Documentation
e9c6791 [Reza Zadeh] New interface and documentation
613f261 [Reza Zadeh] Column magnitude summary
75a0b51 [Reza Zadeh] Use Ints instead of Longs in the shuffle
0f12ade [Reza Zadeh] Style changes
eb1dc20 [Reza Zadeh] Use Double.PositiveInfinity instead of Double.Max
f56a882 [Reza Zadeh] Remove changes to MultivariateOnlineSummarizer
dbc55ba [Reza Zadeh] Make colMagnitudes a method in RowMatrix
41e8ece [Reza Zadeh] style changes
139c8e1 [Reza Zadeh] Syntax changes
029aa9c [Reza Zadeh] javadoc and new test
75edb25 [Reza Zadeh] All tests passing!
05e59b8 [Reza Zadeh] Add test
502ce52 [Reza Zadeh] new interface
654c4fb [Reza Zadeh] default methods
3726ca9 [Reza Zadeh] Remove MatrixAlgebra
6bebabb [Reza Zadeh] remove changes to MatrixSuite
5b8cd7d [Reza Zadeh] Initial files
2014-09-29 14:15:09 -04:00
|
|
|
// Added normL1 and normL2 to trait MultivariateStatisticalSummary
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.stat.MultivariateStatisticalSummary.normL1"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
2014-09-30 01:56:22 -04:00
|
|
|
"org.apache.spark.mllib.stat.MultivariateStatisticalSummary.normL2"),
|
|
|
|
// MapStatus should be private[spark]
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
2014-10-16 21:38:45 -04:00
|
|
|
"org.apache.spark.scheduler.MapStatus"),
|
[SPARK-3453] Netty-based BlockTransferService, extracted from Spark core
This PR encapsulates #2330, which is itself a continuation of #2240. The first goal of this PR is to provide an alternate, simpler implementation of the ConnectionManager which is based on Netty.
In addition to this goal, however, we want to resolve [SPARK-3796](https://issues.apache.org/jira/browse/SPARK-3796), which calls for a standalone shuffle service which can be integrated into the YARN NodeManager, Standalone Worker, or on its own. This PR makes the first step in this direction by ensuring that the actual Netty service is as small as possible and extracted from Spark core. Given this, we should be able to construct this standalone jar which can be included in other JVMs without incurring significant dependency or runtime issues. The actual work to ensure that such a standalone shuffle service would work in Spark will be left for a future PR, however.
In order to minimize dependencies and allow for the service to be long-running (possibly much longer-running than Spark, and possibly having to support multiple versions of Spark simultaneously), the entire service has been ported to Java, where we have full control over the binary compatibility of the components and do not depend on the Scala runtime or version.
These issues have been addressed by folding in #2330:
SPARK-3453: Refactor Netty module to use BlockTransferService interface
SPARK-3018: Release all buffers upon task completion/failure
SPARK-3002: Create a connection pool and reuse clients across different threads
SPARK-3017: Integration tests and unit tests for connection failures
SPARK-3049: Make sure client doesn't block when server/connection has error(s)
SPARK-3502: SO_RCVBUF and SO_SNDBUF should be bootstrap childOption, not option
SPARK-3503: Disable thread local cache in PooledByteBufAllocator
TODO before mergeable:
- [x] Implement uploadBlock()
- [x] Unit tests for RPC side of code
- [x] Performance testing (see comments [here](https://github.com/apache/spark/pull/2753#issuecomment-59475022))
- [x] Turn OFF by default (currently on for unit testing)
Author: Reynold Xin <rxin@apache.org>
Author: Aaron Davidson <aaron@databricks.com>
Author: cocoatomo <cocoatomo77@gmail.com>
Author: Patrick Wendell <pwendell@gmail.com>
Author: Prashant Sharma <prashant.s@imaginea.com>
Author: Davies Liu <davies.liu@gmail.com>
Author: Anand Avati <avati@redhat.com>
Closes #2753 from aarondav/netty and squashes the following commits:
cadfd28 [Aaron Davidson] Turn netty off by default
d7be11b [Aaron Davidson] Turn netty on by default
4a204b8 [Aaron Davidson] Fail block fetches if client connection fails
2b0d1c0 [Aaron Davidson] 100ch
0c5bca2 [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty
14e37f7 [Aaron Davidson] Address Reynold's comments
8dfcceb [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty
322dfc1 [Aaron Davidson] Address Reynold's comments, including major rename
e5675a4 [Aaron Davidson] Fail outstanding RPCs as well
ccd4959 [Aaron Davidson] Don't throw exception if client immediately fails
9da0bc1 [Aaron Davidson] Add RPC unit tests
d236dfd [Aaron Davidson] Remove no-op serializer :)
7b7a26c [Aaron Davidson] Fix Nio compile issue
dd420fd [Aaron Davidson] Merge branch 'master' of https://github.com/apache/spark into netty-test
939f276 [Aaron Davidson] Attempt to make comm. bidirectional
aa58f67 [cocoatomo] [SPARK-3909][PySpark][Doc] A corrupted format in Sphinx documents and building warnings
8dc1ded [cocoatomo] [SPARK-3867][PySpark] ./python/run-tests failed when it run with Python 2.6 and unittest2 is not installed
5b5dbe6 [Prashant Sharma] [SPARK-2924] Required by scala 2.11, only one fun/ctor amongst overriden alternatives, can have default argument(s).
2c5d9dc [Patrick Wendell] HOTFIX: Fix build issue with Akka 2.3.4 upgrade.
020691e [Davies Liu] [SPARK-3886] [PySpark] use AutoBatchedSerializer by default
ae4083a [Anand Avati] [SPARK-2805] Upgrade Akka to 2.3.4
29c6dcf [Aaron Davidson] [SPARK-3453] Netty-based BlockTransferService, extracted from Spark core
f7e7568 [Reynold Xin] Fixed spark.shuffle.io.receiveBuffer setting.
5d98ce3 [Reynold Xin] Flip buffer.
f6c220d [Reynold Xin] Merge with latest master.
407e59a [Reynold Xin] Fix style violation.
a0518c7 [Reynold Xin] Implemented block uploads.
4b18db2 [Reynold Xin] Copy the buffer in fetchBlockSync.
bec4ea2 [Reynold Xin] Removed OIO and added num threads settings.
1bdd7ee [Reynold Xin] Fixed tests.
d68f328 [Reynold Xin] Logging close() in case close() fails.
f63fb4c [Reynold Xin] Add more debug message.
6afc435 [Reynold Xin] Added logging.
c066309 [Reynold Xin] Implement java.io.Closeable interface.
519d64d [Reynold Xin] Mark private package visibility and MimaExcludes.
f0a16e9 [Reynold Xin] Fixed test hanging.
14323a5 [Reynold Xin] Removed BlockManager.getLocalShuffleFromDisk.
b2f3281 [Reynold Xin] Added connection pooling.
d23ed7b [Reynold Xin] Incorporated feedback from Norman: - use same pool for boss and worker - remove ioratio - disable caching of byte buf allocator - childoption sendbuf/receivebuf - fire exception through pipeline
9e0cb87 [Reynold Xin] Fixed BlockClientHandlerSuite
5cd33d7 [Reynold Xin] Fixed style violation.
cb589ec [Reynold Xin] Added more test cases covering cleanup when fault happens in ShuffleBlockFetcherIteratorSuite
1be4e8e [Reynold Xin] Shorten NioManagedBuffer and NettyManagedBuffer class names.
108c9ed [Reynold Xin] Forgot to add TestSerializer to the commit list.
b5c8d1f [Reynold Xin] Fixed ShuffleBlockFetcherIteratorSuite.
064747b [Reynold Xin] Reference count buffers and clean them up properly.
2b44cf1 [Reynold Xin] Added more documentation.
1760d32 [Reynold Xin] Use Epoll.isAvailable in BlockServer as well.
165eab1 [Reynold Xin] [SPARK-3453] Refactor Netty module to use BlockTransferService.
2014-10-29 14:27:07 -04:00
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.network.netty.PathResolver"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.spark.network.netty.client.BlockClientListener"),
|
|
|
|
|
2014-10-16 21:38:45 -04:00
|
|
|
// TaskContext was promoted to Abstract class
|
|
|
|
ProblemFilters.exclude[AbstractClassProblem](
|
[SPARK-4084] Reuse sort key in Sorter
Sorter uses generic-typed key for sorting. When data is large, it creates lots of key objects, which is not efficient. We should reuse the key in Sorter for memory efficiency. This change is part of the petabyte sort implementation from rxin .
The `Sorter` class was written in Java and marked package private. So it is only available to `org.apache.spark.util.collection`. I renamed it to `TimSort` and add a simple wrapper of it, still called `Sorter`, in Scala, which is `private[spark]`.
The benchmark code is updated, which now resets the array before each run. Here is the result on sorting primitive Int arrays of size 25 million using Sorter:
~~~
[info] - Sorter benchmark for key-value pairs !!! IGNORED !!!
Java Arrays.sort() on non-primitive int array: Took 13237 ms
Java Arrays.sort() on non-primitive int array: Took 13320 ms
Java Arrays.sort() on non-primitive int array: Took 15718 ms
Java Arrays.sort() on non-primitive int array: Took 13283 ms
Java Arrays.sort() on non-primitive int array: Took 13267 ms
Java Arrays.sort() on non-primitive int array: Took 15122 ms
Java Arrays.sort() on non-primitive int array: Took 15495 ms
Java Arrays.sort() on non-primitive int array: Took 14877 ms
Java Arrays.sort() on non-primitive int array: Took 16429 ms
Java Arrays.sort() on non-primitive int array: Took 14250 ms
Java Arrays.sort() on non-primitive int array: (13878 ms first try, 14499 ms average)
Java Arrays.sort() on primitive int array: Took 2683 ms
Java Arrays.sort() on primitive int array: Took 2683 ms
Java Arrays.sort() on primitive int array: Took 2701 ms
Java Arrays.sort() on primitive int array: Took 2746 ms
Java Arrays.sort() on primitive int array: Took 2685 ms
Java Arrays.sort() on primitive int array: Took 2735 ms
Java Arrays.sort() on primitive int array: Took 2669 ms
Java Arrays.sort() on primitive int array: Took 2693 ms
Java Arrays.sort() on primitive int array: Took 2680 ms
Java Arrays.sort() on primitive int array: Took 2642 ms
Java Arrays.sort() on primitive int array: (2948 ms first try, 2691 ms average)
Sorter without key reuse on primitive int array: Took 10732 ms
Sorter without key reuse on primitive int array: Took 12482 ms
Sorter without key reuse on primitive int array: Took 10718 ms
Sorter without key reuse on primitive int array: Took 12650 ms
Sorter without key reuse on primitive int array: Took 10747 ms
Sorter without key reuse on primitive int array: Took 10783 ms
Sorter without key reuse on primitive int array: Took 12721 ms
Sorter without key reuse on primitive int array: Took 10604 ms
Sorter without key reuse on primitive int array: Took 10622 ms
Sorter without key reuse on primitive int array: Took 11843 ms
Sorter without key reuse on primitive int array: (11089 ms first try, 11390 ms average)
Sorter with key reuse on primitive int array: Took 5141 ms
Sorter with key reuse on primitive int array: Took 5298 ms
Sorter with key reuse on primitive int array: Took 5066 ms
Sorter with key reuse on primitive int array: Took 5164 ms
Sorter with key reuse on primitive int array: Took 5203 ms
Sorter with key reuse on primitive int array: Took 5274 ms
Sorter with key reuse on primitive int array: Took 5186 ms
Sorter with key reuse on primitive int array: Took 5159 ms
Sorter with key reuse on primitive int array: Took 5164 ms
Sorter with key reuse on primitive int array: Took 5078 ms
Sorter with key reuse on primitive int array: (5311 ms first try, 5173 ms average)
~~~
So with key reuse, it is faster and less likely to trigger GC.
Author: Xiangrui Meng <meng@databricks.com>
Author: Reynold Xin <rxin@apache.org>
Closes #2937 from mengxr/SPARK-4084 and squashes the following commits:
d73c3d0 [Xiangrui Meng] address comments
0b7b682 [Xiangrui Meng] fix mima
a72f53c [Xiangrui Meng] update timeIt
38ba50c [Xiangrui Meng] update timeIt
720f731 [Xiangrui Meng] add doc about JIT specialization
78f2879 [Xiangrui Meng] update tests
7de2efd [Xiangrui Meng] update the Sorter benchmark code to be correct
8626356 [Xiangrui Meng] add prepare to timeIt and update testsin SorterSuite
5f0d530 [Xiangrui Meng] update method modifiers of SortDataFormat
6ffbe66 [Xiangrui Meng] rename Sorter to TimSort and add a Scala wrapper that is private[spark]
b00db4d [Xiangrui Meng] doc and tests
cf94e8a [Xiangrui Meng] renaming
464ddce [Reynold Xin] cherry-pick rxin's commit
2014-10-28 18:14:41 -04:00
|
|
|
"org.apache.spark.TaskContext"),
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.util.collection.SortDataFormat")
|
2014-10-19 23:02:31 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// Adding new methods to the JavaRDDLike trait:
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.takeAsync"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.foreachPartitionAsync"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.countAsync"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.foreachAsync"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.collectAsync")
|
2014-10-29 17:01:00 -04:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-3822
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.SparkContext.org$apache$spark$SparkContext$$createTaskScheduler")
|
2014-11-10 01:11:20 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-1209
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.hadoop.mapreduce.SparkHadoopMapReduceUtil"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem](
|
|
|
|
"org.apache.hadoop.mapred.SparkHadoopMapRedUtil"),
|
|
|
|
ProblemFilters.exclude[MissingTypesProblem](
|
|
|
|
"org.apache.spark.rdd.PairRDDFunctions")
|
2014-11-14 17:33:37 -05:00
|
|
|
) ++ Seq(
|
|
|
|
// SPARK-4062
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.kafka.KafkaReceiver#MessageHandler.this")
|
[MLlib] [SPARK-2885] DIMSUM: All-pairs similarity
# All-pairs similarity via DIMSUM
Compute all pairs of similar vectors using brute force approach, and also DIMSUM sampling approach.
Laying down some notation: we are looking for all pairs of similar columns in an m x n RowMatrix whose entries are denoted a_ij, with the i’th row denoted r_i and the j’th column denoted c_j. There is an oversampling parameter labeled ɣ that should be set to 4 log(n)/s to get provably correct results (with high probability), where s is the similarity threshold.
The algorithm is stated with a Map and Reduce, with proofs of correctness and efficiency in published papers [1] [2]. The reducer is simply the summation reducer. The mapper is more interesting, and is also the heart of the scheme. As an exercise, you should try to see why in expectation, the map-reduce below outputs cosine similarities.
![dimsumv2](https://cloud.githubusercontent.com/assets/3220351/3807272/d1d9514e-1c62-11e4-9f12-3cfdb1d78b3a.png)
[1] Bosagh-Zadeh, Reza and Carlsson, Gunnar (2013), Dimension Independent Matrix Square using MapReduce, arXiv:1304.1467 http://arxiv.org/abs/1304.1467
[2] Bosagh-Zadeh, Reza and Goel, Ashish (2012), Dimension Independent Similarity Computation, arXiv:1206.2082 http://arxiv.org/abs/1206.2082
# Testing
Tests for all invocations included.
Added L1 and L2 norm computation to MultivariateStatisticalSummary since it was needed. Added tests for both of them.
Author: Reza Zadeh <rizlar@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>
Closes #1778 from rezazadeh/dimsumv2 and squashes the following commits:
404c64c [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
4eb71c6 [Reza Zadeh] Add excludes for normL1 and normL2
ee8bd65 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
976ddd4 [Reza Zadeh] Broadcast colMags. Avoid div by zero.
3467cff [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
aea0247 [Reza Zadeh] Allow large thresholds to promote sparsity
9fe17c0 [Xiangrui Meng] organize imports
2196ba5 [Xiangrui Meng] Merge branch 'rezazadeh-dimsumv2' into dimsumv2
254ca08 [Reza Zadeh] Merge remote-tracking branch 'upstream/master' into dimsumv2
f2947e4 [Xiangrui Meng] some optimization
3c4cf41 [Xiangrui Meng] Merge branch 'master' into rezazadeh-dimsumv2
0e4eda4 [Reza Zadeh] Use partition index for RNG
251bb9c [Reza Zadeh] Documentation
25e9d0d [Reza Zadeh] Line length for style
fb296f6 [Reza Zadeh] renamed to normL1 and normL2
3764983 [Reza Zadeh] Documentation
e9c6791 [Reza Zadeh] New interface and documentation
613f261 [Reza Zadeh] Column magnitude summary
75a0b51 [Reza Zadeh] Use Ints instead of Longs in the shuffle
0f12ade [Reza Zadeh] Style changes
eb1dc20 [Reza Zadeh] Use Double.PositiveInfinity instead of Double.Max
f56a882 [Reza Zadeh] Remove changes to MultivariateOnlineSummarizer
dbc55ba [Reza Zadeh] Make colMagnitudes a method in RowMatrix
41e8ece [Reza Zadeh] style changes
139c8e1 [Reza Zadeh] Syntax changes
029aa9c [Reza Zadeh] javadoc and new test
75edb25 [Reza Zadeh] All tests passing!
05e59b8 [Reza Zadeh] Add test
502ce52 [Reza Zadeh] new interface
654c4fb [Reza Zadeh] default methods
3726ca9 [Reza Zadeh] Remove MatrixAlgebra
6bebabb [Reza Zadeh] remove changes to MatrixSuite
5b8cd7d [Reza Zadeh] Initial files
2014-09-29 14:15:09 -04:00
|
|
|
)
|
2014-09-16 00:14:00 -04:00
|
|
|
|
2014-07-23 20:12:28 -04:00
|
|
|
case v if v.startsWith("1.1") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("deploy"),
|
|
|
|
MimaBuild.excludeSparkPackage("graphx")
|
|
|
|
) ++
|
|
|
|
Seq(
|
|
|
|
// Adding new method to JavaRDDLike trait - we should probably mark this as a developer API.
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.api.java.JavaRDDLike.partitions"),
|
2014-09-02 02:28:19 -04:00
|
|
|
// Should probably mark this as Experimental
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.foreachAsync"),
|
2014-07-23 20:12:28 -04:00
|
|
|
// We made a mistake earlier (ed06500d3) in the Java API to use default parameter values
|
|
|
|
// for countApproxDistinct* functions, which does not work in Java. We later removed
|
|
|
|
// them, and use the following to tell Mima to not care about them.
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey"),
|
|
|
|
ProblemFilters.exclude[IncompatibleResultTypeProblem](
|
|
|
|
"org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaPairRDD.countApproxDistinct$default$1"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaPairRDD.countApproxDistinctByKey$default$1"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDD.countApproxDistinct$default$1"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaRDDLike.countApproxDistinct$default$1"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.api.java.JavaDoubleRDD.countApproxDistinct$default$1"),
|
2014-08-30 02:05:18 -04:00
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.storage.DiskStore.getValues"),
|
2014-07-23 20:12:28 -04:00
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.storage.MemoryStore.Entry")
|
|
|
|
) ++
|
2014-08-16 02:12:34 -04:00
|
|
|
Seq(
|
|
|
|
// Serializer interface change. See SPARK-3045.
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.serializer.DeserializationStream"),
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.serializer.Serializer"),
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.serializer.SerializationStream"),
|
|
|
|
ProblemFilters.exclude[IncompatibleTemplateDefProblem](
|
|
|
|
"org.apache.spark.serializer.SerializerInstance")
|
|
|
|
)++
|
2014-07-23 20:12:28 -04:00
|
|
|
Seq(
|
2014-07-27 19:08:16 -04:00
|
|
|
// Renamed putValues -> putArray + putIterator
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.storage.MemoryStore.putValues"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.storage.DiskStore.putValues"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.storage.TachyonStore.putValues")
|
|
|
|
) ++
|
|
|
|
Seq(
|
2014-08-01 07:32:46 -04:00
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.streaming.flume.FlumeReceiver.this"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.streaming.kafka.KafkaUtils.createStream"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.streaming.kafka.KafkaReceiver.this")
|
2014-07-23 20:12:28 -04:00
|
|
|
) ++
|
|
|
|
Seq( // Ignore some private methods in ALS.
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateFeatures"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]( // The only public constructor is the one without arguments.
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.this"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$$<init>$default$7"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.mllib.recommendation.ALS.org$apache$spark$mllib$recommendation$ALS$^dateFeatures")
|
|
|
|
) ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.linalg.distributed.ColumnStatisticsAggregator") ++
|
|
|
|
MimaBuild.excludeSparkClass("rdd.ZippedRDD") ++
|
|
|
|
MimaBuild.excludeSparkClass("rdd.ZippedPartition") ++
|
|
|
|
MimaBuild.excludeSparkClass("util.SerializableHyperLogLog") ++
|
|
|
|
MimaBuild.excludeSparkClass("storage.Values") ++
|
|
|
|
MimaBuild.excludeSparkClass("storage.Entry") ++
|
|
|
|
MimaBuild.excludeSparkClass("storage.MemoryStore$Entry") ++
|
2014-09-03 17:57:38 -04:00
|
|
|
// Class was missing "@DeveloperApi" annotation in 1.0.
|
|
|
|
MimaBuild.excludeSparkClass("scheduler.SparkListenerApplicationStart") ++
|
2014-07-23 20:12:28 -04:00
|
|
|
Seq(
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.mllib.tree.impurity.Gini.calculate"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.mllib.tree.impurity.Entropy.calculate"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem](
|
|
|
|
"org.apache.spark.mllib.tree.impurity.Variance.calculate")
|
2014-07-30 20:34:32 -04:00
|
|
|
) ++
|
2014-09-03 17:57:38 -04:00
|
|
|
Seq( // Package-private classes removed in SPARK-2341
|
2014-07-30 20:34:32 -04:00
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.BinaryLabelParser$"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.LabelParser$"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser"),
|
|
|
|
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.mllib.util.MulticlassLabelParser$")
|
2014-09-03 17:57:38 -04:00
|
|
|
) ++
|
2014-08-12 01:33:45 -04:00
|
|
|
Seq( // private method removed in MLlib
|
2014-08-08 18:07:31 -04:00
|
|
|
ProblemFilters.exclude[MissingMethodProblem](
|
|
|
|
"org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.org$apache$spark$mllib$regression$GeneralizedLinearAlgorithm$$prependOne")
|
2014-08-12 01:33:45 -04:00
|
|
|
) ++
|
|
|
|
Seq( // new Vector methods in MLlib (binary compatible assuming users do not implement Vector)
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.copy")
|
2014-08-15 11:53:52 -04:00
|
|
|
) ++
|
2014-08-16 18:13:34 -04:00
|
|
|
Seq( // synthetic methods generated in LabeledPoint
|
|
|
|
ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.mllib.regression.LabeledPoint$"),
|
|
|
|
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.mllib.regression.LabeledPoint.apply"),
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.regression.LabeledPoint.toString")
|
|
|
|
) ++
|
2014-08-15 11:53:52 -04:00
|
|
|
Seq ( // Scala 2.11 compatibility fix
|
|
|
|
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.StreamingContext.<init>$default$2")
|
2014-08-02 02:55:30 -04:00
|
|
|
)
|
2014-07-23 20:12:28 -04:00
|
|
|
case v if v.startsWith("1.0") =>
|
|
|
|
Seq(
|
|
|
|
MimaBuild.excludeSparkPackage("api.java"),
|
|
|
|
MimaBuild.excludeSparkPackage("mllib"),
|
|
|
|
MimaBuild.excludeSparkPackage("streaming")
|
|
|
|
) ++
|
|
|
|
MimaBuild.excludeSparkClass("rdd.ClassTags") ++
|
|
|
|
MimaBuild.excludeSparkClass("util.XORShiftRandom") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.EdgeRDD") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.VertexRDD") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.impl.GraphImpl") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.impl.RoutingTable") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.util.collection.PrimitiveKeyOpenHashMap") ++
|
|
|
|
MimaBuild.excludeSparkClass("graphx.util.collection.GraphXPrimitiveKeyOpenHashMap") ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.recommendation.MFDataGenerator") ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.optimization.SquaredGradient") ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.regression.RidgeRegressionWithSGD") ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.regression.LassoWithSGD") ++
|
|
|
|
MimaBuild.excludeSparkClass("mllib.regression.LinearRegressionWithSGD")
|
|
|
|
case _ => Seq()
|
|
|
|
}
|
2014-06-01 20:27:05 -04:00
|
|
|
}
|