diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 5b42843717..f219c5605b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -86,7 +86,7 @@ public final class UnsafeInMemorySorter { private final PrefixComparators.RadixSortSupport radixSortSupport; /** - * Within this buffer, position {@code 2 * i} holds a pointer pointer to the record at + * Within this buffer, position {@code 2 * i} holds a pointer to the record at * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix. * * Only part of the array will be used to store the pointers, the rest part is preserved as diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java index 430bf677ed..d9f84d10e9 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java @@ -25,7 +25,7 @@ import org.apache.spark.util.collection.SortDataFormat; * Supports sorting an array of (record pointer, key prefix) pairs. * Used in {@link UnsafeInMemorySorter}. *

- * Within each long[] buffer, position {@code 2 * i} holds a pointer pointer to the record at + * Within each long[] buffer, position {@code 2 * i} holds a pointer to the record at * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix. */ public final class UnsafeSortDataFormat diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 6f5c31d7ab..4ca442b629 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -317,7 +317,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf, pool } - // Make sure that that we aren't going to exceed the max RPC message size by making sure + // Make sure that we aren't going to exceed the max RPC message size by making sure // we use broadcast to send large map output statuses. if (minSizeForBroadcast > maxRpcMessageSize) { val msg = s"spark.shuffle.mapOutput.minSizeForBroadcast ($minSizeForBroadcast bytes) must " + diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 7745387dbc..8c1b5f7bf0 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -98,7 +98,7 @@ case class FetchFailed( * 4 task failures, instead we immediately go back to the stage which generated the map output, * and regenerate the missing data. (2) we don't count fetch failures for blacklisting, since * presumably its not the fault of the executor where the task ran, but the executor which - * stored the data. This is especially important because we we might rack up a bunch of + * stored the data. This is especially important because we might rack up a bunch of * fetch-failures in rapid succession, on all nodes of the cluster, due to one bad node. */ override def countTowardsTaskFailures: Boolean = false diff --git a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala index 79f4d06c84..320af5cf97 100644 --- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala @@ -43,7 +43,7 @@ import org.apache.spark.util.{ThreadUtils, Utils} * Execute using * ./bin/spark-class org.apache.spark.deploy.FaultToleranceTest * - * Make sure that that the environment includes the following properties in SPARK_DAEMON_JAVA_OPTS + * Make sure that the environment includes the following properties in SPARK_DAEMON_JAVA_OPTS * *and* SPARK_JAVA_OPTS: * - spark.deploy.recoveryMode=ZOOKEEPER * - spark.deploy.zookeeper.url=172.17.42.1:2181 diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 2b00a4a6b3..54f39f7620 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -291,7 +291,7 @@ object HistoryServer extends Logging { /** * Create a security manager. - * This turns off security in the SecurityManager, so that the the History Server can start + * This turns off security in the SecurityManager, so that the History Server can start * in a Spark cluster where security is enabled. 
* @param config configuration for the SecurityManager constructor * @return the security manager for use in constructing the History Server. diff --git a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala index f7a991770d..8dd1a1ea05 100644 --- a/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/ShuffleReadMetrics.scala @@ -92,7 +92,7 @@ class ShuffleReadMetrics private[spark] () extends Serializable { private[spark] def setRecordsRead(v: Long): Unit = _recordsRead.setValue(v) /** - * Resets the value of the current metrics (`this`) and and merges all the independent + * Resets the value of the current metrics (`this`) and merges all the independent * [[TempShuffleReadMetrics]] into `this`. */ private[spark] def setMergeValues(metrics: Seq[TempShuffleReadMetrics]): Unit = { diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index dc70eb82d2..3d4ea3cccc 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -37,7 +37,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.Utils /** - * A BlockTransferService that uses Netty to fetch a set of blocks at at time. + * A BlockTransferService that uses Netty to fetch a set of blocks at a time. */ private[spark] class NettyBlockTransferService( conf: SparkConf, diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala index 386fdfd218..3bfdf95db8 100644 --- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala +++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala @@ -350,7 +350,7 @@ object SizeEstimator extends Logging { // 3. consistent fields layouts throughout the hierarchy: This means we should layout // superclass first. And we can use superclass's shellSize as a starting point to layout the // other fields in this class. - // 4. class alignment: HotSpot rounds field blocks up to to HeapOopSize not 4 bytes, confirmed + // 4. class alignment: HotSpot rounds field blocks up to HeapOopSize not 4 bytes, confirmed // with Aleksey. see https://bugs.openjdk.java.net/browse/CODETOOLS-7901322 // // The real world field layout is much more complicated. 
There are three kinds of fields diff --git a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala index e3304be792..7998e3702c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala @@ -253,7 +253,7 @@ class ApplicationCacheSuite extends SparkFunSuite with Logging with MockitoSugar assertNotFound(appId, None) } - test("Test that if an attempt ID is is set, it must be used in lookups") { + test("Test that if an attempt ID is set, it must be used in lookups") { val operations = new StubCacheOperations() val clock = new ManualClock(1) implicit val cache = new ApplicationCache(operations, retainedApplications = 10, clock = clock) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 5e8a854e46..f3d3f701af 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1819,7 +1819,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostA"))) - // Reducer should run where RDD 2 has preferences, even though though it also has a shuffle dep + // Reducer should run where RDD 2 has preferences, even though it also has a shuffle dep val reduceTaskSet = taskSets(1) assertLocations(reduceTaskSet, Seq(Seq("hostB"))) complete(reduceTaskSet, Seq((Success, 42))) @@ -2058,7 +2058,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou // Now complete tasks in the second task set val newTaskSet = taskSets(1) - assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on on hostA + assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on hostA runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2))) assert(results.size === 0) // Map stage job should not be complete yet runEvent(makeCompletionEvent(newTaskSet.tasks(1), Success, makeMapStatus("hostB", 2))) diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 7f0838268a..c8b6a3346a 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -53,7 +53,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { conf } - test("single insert insert") { + test("single insert") { val conf = createSparkConf(loadDefaults = false) sc = new SparkContext("local", "test", conf) val map = createExternalMap[Int] diff --git a/docs/ml-features.md b/docs/ml-features.md index 1d3449746c..d67fce3c95 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -752,7 +752,7 @@ for more details on the API. `Interaction` is a `Transformer` which takes vector or double-valued columns, and generates a single vector column that contains the product of all combinations of one value from each input column. 
-For example, if you have 2 vector type columns each of which has 3 dimensions as input columns, then then you'll get a 9-dimensional vector as the output column. +For example, if you have 2 vector type columns each of which has 3 dimensions as input columns, then you'll get a 9-dimensional vector as the output column. **Examples** diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 12797bd868..430c069045 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -354,7 +354,7 @@ v = u.map(lambda x: 1.0 + 2.0 * x) useful for visualizing empirical probability distributions without requiring assumptions about the particular distribution that the observed samples are drawn from. It computes an estimate of the probability density function of a random variables, evaluated at a given set of points. It achieves -this estimate by expressing the PDF of the empirical distribution at a particular point as the the +this estimate by expressing the PDF of the empirical distribution at a particular point as the mean of PDFs of normal distributions centered around each of the samples.

diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index 2458bb5ffa..9b82e8e744 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -244,7 +244,7 @@ Note that the following Kafka params cannot be set and the Kafka source will thr - **group.id**: Kafka source will create a unique group id for each query automatically. - **auto.offset.reset**: Set the source option `startingOffsets` to specify where to start instead. Structured Streaming manages which offsets are consumed internally, rather - than rely on the kafka Consumer to do it. This will ensure that no data is missed when when new + than rely on the kafka Consumer to do it. This will ensure that no data is missed when new topics/partitions are dynamically subscribed. Note that `startingOffsets` only applies when a new Streaming query is started, and that resuming will always pick up from where the query left off. - **key.deserializer**: Keys are always deserialized as byte arrays with ByteArrayDeserializer. Use diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 799f636505..6cd050e4f2 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -680,7 +680,7 @@ windowedCounts = words.groupBy( ### Handling Late Data and Watermarking Now consider what happens if one of the events arrives late to the application. -For example, say, a word generated at 12:04 (i.e. event time) could be received received by +For example, say, a word generated at 12:04 (i.e. event time) could be received by the application at 12:11. The application should use the time 12:04 instead of 12:11 to update the older counts for the window `12:00 - 12:10`. This occurs naturally in our window-based grouping – Structured Streaming can maintain the intermediate state diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java index 9041244279..0e5d00565b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java @@ -52,7 +52,7 @@ public class JavaLDAExample { double ll = model.logLikelihood(dataset); double lp = model.logPerplexity(dataset); System.out.println("The lower bound on the log likelihood of the entire corpus: " + ll); - System.out.println("The upper bound bound on perplexity: " + lp); + System.out.println("The upper bound on perplexity: " + lp); // Describe topics. Dataset topics = model.describeTopics(3); diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py index 2dc1742ff7..a8b346f72c 100644 --- a/examples/src/main/python/ml/lda_example.py +++ b/examples/src/main/python/ml/lda_example.py @@ -46,7 +46,7 @@ if __name__ == "__main__": ll = model.logLikelihood(dataset) lp = model.logPerplexity(dataset) print("The lower bound on the log likelihood of the entire corpus: " + str(ll)) - print("The upper bound bound on perplexity: " + str(lp)) + print("The upper bound on perplexity: " + str(lp)) # Describe topics. 
topics = model.describeTopics(3) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala index 22b3b0e3ad..4215d37cb5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/LDAExample.scala @@ -50,7 +50,7 @@ object LDAExample { val ll = model.logLikelihood(dataset) val lp = model.logPerplexity(dataset) println(s"The lower bound on the log likelihood of the entire corpus: $ll") - println(s"The upper bound bound on perplexity: $lp") + println(s"The upper bound on perplexity: $lp") // Describe topics. val topics = model.describeTopics(3) diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala index 41f27e9376..e5b63aa1a7 100644 --- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala +++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/SparkSink.scala @@ -45,7 +45,7 @@ import org.apache.flume.sink.AbstractSink * the thread itself is blocked and a reference to it saved off. * * When the ack for that batch is received, - * the thread which created the transaction is is retrieved and it commits the transaction with the + * the thread which created the transaction is retrieved and it commits the transaction with the * channel from the same thread it was originally created in (since Flume transactions are * thread local). If a nack is received instead, the sink rolls back the transaction. If no ack * is received within the specified timeout, the transaction is rolled back too. If an ack comes diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index aa01238f91..ff9965b854 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -212,7 +212,7 @@ private[kafka010] class KafkaSourceProvider extends StreamSourceProvider |Instead set the source option '$STARTING_OFFSETS_OPTION_KEY' to 'earliest' or 'latest' |to specify where to start. Structured Streaming manages which offsets are consumed |internally, rather than relying on the kafkaConsumer to do it. This will ensure that no - |data is missed when when new topics/partitions are dynamically subscribed. Note that + |data is missed when new topics/partitions are dynamically subscribed. Note that |'$STARTING_OFFSETS_OPTION_KEY' only applies when a new Streaming query is started, and |that resuming will always pick up from where the query left off. See the docs for more |details. 
diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index a4d81a6809..18a5a1509a 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -129,7 +129,7 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) /** * Test the WriteAheadLogBackedRDD, by writing some partitions of the data to block manager - * and the rest to a write ahead log, and then reading reading it all back using the RDD. + * and the rest to a write ahead log, and then reading it all back using the RDD. * It can also test if the partitions that were read from the log were again stored in * block manager. * diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 583e5e0928..728a883b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -512,7 +512,7 @@ abstract class LDAModel private[ml] ( } /** - * Calculate an upper bound bound on perplexity. (Lower is better.) + * Calculate an upper bound on perplexity. (Lower is better.) * See Equation (16) in the Online LDA paper (Hoffman et al., 2010). * * WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when [[optimizer]] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala index 5cbfbff3e4..4d6520d0b2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala @@ -54,7 +54,7 @@ private[python] class Word2VecModelWrapper(model: Word2VecModel) { } /** - * Finds words similar to the the vector representation of a word without + * Finds words similar to the vector representation of a word without * filtering results. * @param vector a vector * @param num number of synonyms to find diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 25ffd8561f..933a5f1d52 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -245,7 +245,7 @@ class LocalLDAModel private[spark] ( } /** - * Calculate an upper bound bound on perplexity. (Lower is better.) + * Calculate an upper bound on perplexity. (Lower is better.) * See Equation (16) in original Online LDA paper. 
* * @param documents test corpus to use for calculating perplexity diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index 46deb545af..f44c8fe351 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -29,7 +29,7 @@ import org.apache.spark.streaming.dstream.DStream /** * :: DeveloperApi :: * StreamingLinearAlgorithm implements methods for continuously - * training a generalized linear model model on streaming data, + * training a generalized linear model on streaming data, * and using it for prediction on (possibly different) streaming data. * * This class takes as type parameters a GeneralizedLinearModel, diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 35d0aefa04..54510e0bea 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -699,7 +699,7 @@ class LDAModel(JavaModel): @since("2.0.0") def logPerplexity(self, dataset): """ - Calculate an upper bound bound on perplexity. (Lower is better.) + Calculate an upper bound on perplexity. (Lower is better.) See Equation (16) in the Online LDA paper (Hoffman et al., 2010). WARNING: If this model is an instance of :py:class:`DistributedLDAModel` (produced when diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index 1705c156ce..b765343251 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -481,7 +481,7 @@ class SparseVector(Vector): >>> SparseVector(4, {1:1.0, 6:2.0}) Traceback (most recent call last): ... - AssertionError: Index 6 is out of the the size of vector with size=4 + AssertionError: Index 6 is out of the size of vector with size=4 >>> SparseVector(4, {-1:1.0}) Traceback (most recent call last): ... @@ -521,7 +521,7 @@ class SparseVector(Vector): if self.indices.size > 0: assert np.max(self.indices) < self.size, \ - "Index %d is out of the the size of vector with size=%d" \ + "Index %d is out of the size of vector with size=%d" \ % (np.max(self.indices), self.size) assert np.min(self.indices) >= 0, \ "Contains negative index %d" % (np.min(self.indices)) diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 2a85ec01bc..7bc6a59ad3 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -95,7 +95,7 @@ def install_exception_handler(): original = py4j.protocol.get_return_value # The original `get_return_value` is not patched, it's idempotent. 
patched = capture_sql_exception(original) - # only patch the one used in in py4j.java_gateway (call Java API) + # only patch the one used in py4j.java_gateway (call Java API) py4j.java_gateway.get_return_value = patched diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala index 8772e26f43..fb2d61f621 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala @@ -32,7 +32,7 @@ private[yarn] case class ContainerLocalityPreferences(nodes: Array[String], rack /** * This strategy is calculating the optimal locality preferences of YARN containers by considering - * the node ratio of pending tasks, number of required cores/containers and and locality of current + * the node ratio of pending tasks, number of required cores/containers and locality of current * existing and pending allocated containers. The target of this algorithm is to maximize the number * of tasks that would run locally. * diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index d205547698..86de90984c 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -196,7 +196,7 @@ public final class UnsafeRow extends InternalRow implements Externalizable, Kryo assertIndexIsValid(i); BitSetMethods.set(baseObject, baseOffset, i); // To preserve row equality, zero out the value when setting the column to null. - // Since this row does does not currently support updates to variable-length values, we don't + // Since this row does not currently support updates to variable-length values, we don't // have to worry about zeroing out that data. Platform.putLong(baseObject, getFieldOffset(i), 0); } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index 13115f4728..07d294b108 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -516,7 +516,7 @@ case class CumeDist() extends RowNumberLike with SizeBasedWindowFunction { * into the number of buckets); both variables are based on the size of the current partition. * During the calculation process the function keeps track of the current row number, the current * bucket number, and the row number at which the bucket will change (bucketThreshold). When the - * current row number reaches bucket threshold, the bucket value is increased by one and the the + * current row number reaches bucket threshold, the bucket value is increased by one and the * threshold is increased by the bucket size (plus one extra if the current bucket is padded). * * This documentation has been based upon similar documentation for the Hive and Presto projects. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index d583fa31b0..c977e788b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -795,7 +795,7 @@ case object OneRowRelation extends LeafNode { /** * Computes [[Statistics]] for this plan. The default implementation assumes the output - * cardinality is the product of of all child plan's cardinality, i.e. applies in the case + * cardinality is the product of all child plan's cardinality, i.e. applies in the case * of cartesian joins. * * [[LeafNode]]s must override this. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 235ca8d263..a96a3b7af2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -142,7 +142,7 @@ object DateTimeUtils { } /** - * Returns the number of days since epoch from from java.sql.Date. + * Returns the number of days since epoch from java.sql.Date. */ def fromJavaDate(date: Date): SQLDate = { millisToDays(date.getTime) @@ -503,7 +503,7 @@ object DateTimeUtils { } /** - * Calculates the year and and the number of the day in the year for the given + * Calculates the year and the number of the day in the year for the given * number of days. The given days is the number of days since 1.1.1970. * * The calculation uses the fact that the period 1.1.2001 until 31.12.2400 is diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala index 97cfb5f06d..273f95f91e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AttributeSetSuite.scala @@ -52,7 +52,7 @@ class AttributeSetSuite extends SparkFunSuite { assert((aSet ++ bSet).contains(aLower) === true) } - test("extracts all references references") { + test("extracts all references") { val addSet = AttributeSet(Add(aUpper, Alias(bUpper, "test")()):: Nil) assert(addSet.contains(aUpper)) assert(addSet.contains(aLower)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index c1cedd8541..2a06f3c47c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -361,7 +361,7 @@ class Dataset[T] private[sql]( * method used to map columns depend on the type of `U`: * - When `U` is a class, fields for the class will be mapped to columns of the same name * (case sensitivity is determined by `spark.sql.caseSensitive`). - * - When `U` is a tuple, the columns will be be mapped by ordinal (i.e. the first column will + * - When `U` is a tuple, the columns will be mapped by ordinal (i.e. the first column will * be assigned to `_1`). * - When `U` is a primitive type (i.e. String, Int, etc), then the first column of the * `DataFrame` will be used. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index bc290702dc..bad59961ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -41,7 +41,7 @@ object PartitionPath { } /** - * Holds a directory in a partitioned collection of files as well as as the partition values + * Holds a directory in a partitioned collection of files as well as the partition values * in the form of a Row. Before scanning, the files at `path` need to be enumerated. */ case class PartitionPath(values: InternalRow, path: Path) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 0ce47b152c..0b39965b31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -285,7 +285,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Starts the execution of the streaming query, which will continually send results to the given - * `ForeachWriter` as as new data arrives. The `ForeachWriter` can be used to send the data + * `ForeachWriter` as new data arrives. The `ForeachWriter` can be used to send the data * generated by the `DataFrame`/`Dataset` to an external system. * * Scala example: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala index 4296ec543e..22d5c47a6f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala @@ -257,7 +257,7 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B } } - test("time window in SQL with with two expressions") { + test("time window in SQL with two expressions") { withTempTable { table => checkAnswer( spark.sql( @@ -272,7 +272,7 @@ class DataFrameTimeWindowingSuite extends QueryTest with SharedSQLContext with B } } - test("time window in SQL with with three expressions") { + test("time window in SQL with three expressions") { withTempTable { table => checkAnswer( spark.sql( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 375da224aa..0bfc92fdb6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -363,7 +363,7 @@ class PlannerSuite extends SharedSQLContext { // This is a regression test for SPARK-9703 test("EnsureRequirements should not repartition if only ordering requirement is unsatisfied") { // Consider an operator that imposes both output distribution and ordering requirements on its - // children, such as sort sort merge join. If the distribution requirements are satisfied but + // children, such as sort merge join. If the distribution requirements are satisfied but // the output ordering requirements are unsatisfied, then the planner should only add sorts and // should not need to add additional shuffles / exchanges. 
val outputOrdering = Seq(SortOrder(Literal(1), Ascending)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 22f59f63d6..f67444fbc4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -144,7 +144,7 @@ class FileStreamSinkSuite extends StreamTest { } // This tests whether FileStreamSink works with aggregations. Specifically, it tests - // whether the the correct streaming QueryExecution (i.e. IncrementalExecution) is used to + // whether the correct streaming QueryExecution (i.e. IncrementalExecution) is used to // to execute the trigger for writing data to file sink. See SPARK-18440 for more details. test("writing with aggregation") { diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java index c2a2b2d478..9dd0efc039 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/AbstractService.java @@ -151,7 +151,7 @@ public abstract class AbstractService implements Service { } /** - * Verify that that a service is in a given state. + * Verify that a service is in a given state. * * @param currentState * the desired state diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java index 8946219d85..a2c580d6ac 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceOperations.java @@ -33,7 +33,7 @@ public final class ServiceOperations { } /** - * Verify that that a service is in a given state. + * Verify that a service is in a given state. * @param state the actual state a service is in * @param expectedState the desired state * @throws IllegalStateException if the service state is different from diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java index d1aadad04c..a1ff10dc2b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/ServiceStateChangeListener.java @@ -29,7 +29,7 @@ public interface ServiceStateChangeListener { * have changed state before this callback is invoked. * * This operation is invoked on the thread that initiated the state change, - * while the service itself in in a synchronized section. + * while the service itself is in a synchronized section. *
    *
  1. Any long-lived operation here will prevent the service state * change from completing in a timely manner.
diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java index 562b3f5e67..b80fd67884 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java @@ -98,7 +98,7 @@ public class TypeDescriptor { * For datetime types this is the length in characters of the String representation * (assuming the maximum allowed precision of the fractional seconds component). * For binary data this is the length in bytes. - * Null is returned for for data types where the column size is not applicable. + * Null is returned for data types where the column size is not applicable. */ public Integer getColumnSize() { if (type.isNumericType()) { diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 5cd4935e22..d217e9b4fe 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -178,7 +178,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "skewjoin", "database", - // These tests fail and and exit the JVM. + // These tests fail and exit the JVM. "auto_join18_multi_distinct", "join18_multi_distinct", "input44", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index c80695bd3e..7ee5fc543c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -40,7 +40,7 @@ import org.apache.spark.util.Utils * The Hive table scan operator. Column and partition pruning are both handled. * * @param requestedAttributes Attributes to be fetched from the Hive table. - * @param relation The Hive table be be scanned. + * @param relation The Hive table to be scanned. * @param partitionPruningPred An optional partition pruning predicate for partitioned table. 
*/ private[hive] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/State.scala b/streaming/src/main/scala/org/apache/spark/streaming/State.scala index 3f560f889f..23cf48eb06 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/State.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/State.scala @@ -178,7 +178,7 @@ private[streaming] class StateImpl[S] extends State[S] { removed } - /** Whether the state has been been updated */ + /** Whether the state has been updated */ def isUpdated(): Boolean = { updated } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala index a3c125c306..9a760e2947 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala @@ -88,7 +88,7 @@ abstract class InputDStream[T: ClassTag](_ssc: StreamingContext) if (!super.isTimeValid(time)) { false // Time not valid } else { - // Time is valid, but check it it is more than lastValidTime + // Time is valid, but check it is more than lastValidTime if (lastValidTime != null && time < lastValidTime) { logWarning(s"isTimeValid called with $time whereas the last valid time " + s"is $lastValidTime") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala index e8c814ba71..9b6bc71c7a 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala @@ -326,7 +326,7 @@ class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with B // Create a MapWithStateRDD that has a long lineage using the data RDD with a long lineage val stateRDDWithLongLineage = makeStateRDDWithLongLineageDataRDD(longLineageRDD) - // Create a new MapWithStateRDD, with the lineage lineage MapWithStateRDD as the parent + // Create a new MapWithStateRDD, with the lineage MapWithStateRDD as the parent new MapWithStateRDD[Int, Int, Int, Int]( stateRDDWithLongLineage, stateRDDWithLongLineage.sparkContext.emptyRDD[(Int, Int)].partitionBy(partitioner), diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala index a37fac8730..c5e695a33a 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala @@ -108,7 +108,7 @@ class WriteAheadLogBackedBlockRDDSuite /** * Test the WriteAheadLogBackedRDD, by writing some partitions of the data to block manager - * and the rest to a write ahead log, and then reading reading it all back using the RDD. + * and the rest to a write ahead log, and then reading it all back using the RDD. * It can also test if the partitions that were read from the log were again stored in * block manager. 
* diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala index a1d0561bf3..b70383ecde 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -90,7 +90,7 @@ class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { listener.pushedData.asScala.toSeq should contain theSameElementsInOrderAs (data1) assert(listener.onAddDataCalled === false) // should be called only with addDataWithCallback() - // Verify addDataWithCallback() add data+metadata and and callbacks are called correctly + // Verify addDataWithCallback() add data+metadata and callbacks are called correctly val data2 = 11 to 20 val metadata2 = data2.map { _.toString } data2.zip(metadata2).foreach { case (d, m) => blockGenerator.addDataWithCallback(d, m) } @@ -103,7 +103,7 @@ class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { listener.pushedData.asScala.toSeq should contain theSameElementsInOrderAs combined } - // Verify addMultipleDataWithCallback() add data+metadata and and callbacks are called correctly + // Verify addMultipleDataWithCallback() add data+metadata and callbacks are called correctly val data3 = 21 to 30 val metadata3 = "metadata" blockGenerator.addMultipleDataWithCallback(data3.iterator, metadata3)