From 485145326a9c97ede260b0e267ee116f182cfd56 Mon Sep 17 00:00:00 2001
From: Josh Soref
Date: Mon, 30 Nov 2020 13:59:51 +0900
Subject: [PATCH] [MINOR] Spelling bin core docs external mllib repl

### What changes were proposed in this pull request?
This PR intends to fix typos in the sub-modules:
* `bin`
* `core`
* `docs`
* `external`
* `mllib`
* `repl`
* `pom.xml`

Split per srowen https://github.com/apache/spark/pull/30323#issuecomment-728981618

NOTE: The misspellings have been reported at https://github.com/jsoref/spark/commit/706a726f87a0bbf5e31467fae9015218773db85b#commitcomment-44064356

### Why are the changes needed?
Misspelled words make the content harder to read and understand.

### Does this PR introduce _any_ user-facing change?
There are various fixes to documentation, error messages, and other user-visible text; all of them are limited to spelling corrections.

### How was this patch tested?
No testing was performed.

Closes #30530 from jsoref/spelling-bin-core-docs-external-mllib-repl.

Authored-by: Josh Soref
Signed-off-by: Takeshi Yamamuro
---
 bin/docker-image-tool.sh | 2 +-
 .../apache/spark/ui/static/spark-dag-viz.js | 2 +-
 .../org/apache/spark/ui/static/utils.js | 2 +-
 .../spark/ExecutorAllocationManager.scala | 4 +-
 .../apache/spark/api/java/JavaPairRDD.scala | 4 +-
 .../apache/spark/api/java/JavaRDDLike.scala | 2 +-
 .../apache/spark/api/python/PythonRDD.scala | 6 +-
 .../apache/spark/deploy/JsonProtocol.scala | 2 +-
 .../org/apache/spark/deploy/SparkSubmit.scala | 2 +-
 .../deploy/history/FsHistoryProvider.scala | 2 +-
 .../spark/deploy/history/HybridStore.scala | 2 +-
 .../org/apache/spark/executor/Executor.scala | 4 +-
 .../apache/spark/metrics/MetricsConfig.scala | 2 +-
 .../metrics/sink/PrometheusServlet.scala | 6 +-
 .../apache/spark/rdd/DoubleRDDFunctions.scala | 2 +-
 .../spark/rdd/OrderedRDDFunctions.scala | 4 +-
 .../main/scala/org/apache/spark/rdd/RDD.scala | 2 +-
 .../spark/resource/TaskResourceRequest.scala | 2 +-
 .../apache/spark/rpc/netty/NettyRpcEnv.scala | 4 +-
 .../BarrierJobAllocationFailed.scala | 4 +-
 .../apache/spark/scheduler/DAGScheduler.scala | 8 +-
 .../spark/scheduler/HealthTracker.scala | 4 +-
 .../spark/scheduler/TaskSetManager.scala | 2 +-
 .../spark/security/CryptoStreamUtils.scala | 2 +-
 .../apache/spark/storage/BlockManager.scala | 4 +-
 .../storage/BlockManagerMasterEndpoint.scala | 2 +-
 .../apache/spark/ui/jobs/AllJobsPage.scala | 2 +-
 .../org/apache/spark/ui/jobs/JobPage.scala | 2 +-
 .../apache/spark/util/ClosureCleaner.scala | 2 +-
 .../scala/org/apache/spark/util/Utils.scala | 22 ++--
 .../spark/util/io/ChunkedByteBuffer.scala | 2 +-
 .../sort/UnsafeShuffleWriterSuite.java | 10 +-
 .../test/org/apache/spark/JavaAPISuite.java | 2 +-
 .../org/apache/spark/CheckpointSuite.scala | 12 +-
 .../apache/spark/ContextCleanerSuite.scala | 10 +-
 .../ExecutorAllocationManagerSuite.scala | 2 +-
 .../scala/org/apache/spark/FileSuite.scala | 2 +-
 .../spark/benchmark/BenchmarkBase.scala | 2 +-
 .../history/FsHistoryProviderSuite.scala | 4 +-
 .../spark/deploy/master/MasterSuite.scala | 2 +-
 .../spark/deploy/worker/WorkerSuite.scala | 2 +-
 .../apache/spark/executor/ExecutorSuite.scala | 2 +-
 ...FileCommitProtocolInstantiationSuite.scala | 4 +-
 .../metrics/InputOutputMetricsSuite.scala | 2 +-
 .../NettyBlockTransferServiceSuite.scala | 2 +-
 .../spark/rdd/PairRDDFunctionsSuite.scala | 34 +++---
 .../scala/org/apache/spark/rdd/RDDSuite.scala | 2 +-
 .../spark/resource/ResourceUtilsSuite.scala | 2 +-
 .../spark/rpc/netty/NettyRpcEnvSuite.scala | 2 +-
 .../spark/scheduler/DAGSchedulerSuite.scala | 6 +-
 .../spark/scheduler/ReplayListenerSuite.scala | 2 +-
.../scheduler/SchedulerIntegrationSuite.scala | 8 +- .../spark/scheduler/SparkListenerSuite.scala | 6 +- .../spark/scheduler/TaskSetManagerSuite.scala | 6 +- .../spark/status/AppStatusListenerSuite.scala | 2 +- .../spark/storage/BlockManagerSuite.scala | 4 +- .../apache/spark/util/JsonProtocolSuite.scala | 8 +- .../spark/util/SizeEstimatorSuite.scala | 2 +- docs/_plugins/include_example.rb | 4 +- docs/building-spark.md | 2 +- docs/configuration.md | 2 +- docs/css/main.css | 4 +- docs/graphx-programming-guide.md | 4 +- docs/ml-migration-guide.md | 2 +- docs/mllib-clustering.md | 2 +- docs/mllib-data-types.md | 2 +- docs/monitoring.md | 6 +- docs/running-on-kubernetes.md | 4 +- docs/running-on-mesos.md | 2 +- docs/running-on-yarn.md | 2 +- docs/sparkr.md | 2 +- docs/sql-data-sources-jdbc.md | 2 +- docs/sql-migration-guide.md | 6 +- ...l-ref-syntax-aux-conf-mgmt-set-timezone.md | 2 +- ...-ref-syntax-ddl-create-table-hiveformat.md | 8 +- docs/sql-ref-syntax-dml-insert-into.md | 114 +++++++++--------- ...l-ref-syntax-dml-insert-overwrite-table.md | 52 ++++---- docs/sql-ref-syntax-qry-select-groupby.md | 4 +- .../sql-ref-syntax-qry-select-lateral-view.md | 6 +- docs/sql-ref-syntax-qry-select-orderby.md | 2 +- .../ml/evaluation/ClusteringMetrics.scala | 4 +- .../apache/spark/ml/feature/Binarizer.scala | 6 +- .../apache/spark/ml/feature/Selector.scala | 2 +- .../spark/ml/feature/StopWordsRemover.scala | 6 +- .../apache/spark/ml/image/ImageSchema.scala | 2 +- .../ml/r/AFTSurvivalRegressionWrapper.scala | 4 +- .../spark/ml/regression/FMRegressor.scala | 2 +- .../spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/DistanceMeasure.scala | 6 +- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../mllib/clustering/StreamingKMeans.scala | 2 +- .../org/apache/spark/mllib/feature/PCA.scala | 4 +- .../apache/spark/mllib/feature/Word2Vec.scala | 2 +- .../spark/mllib/fpm/AssociationRules.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 4 +- .../stat/test/KolmogorovSmirnovTest.scala | 2 +- .../ml/feature/JavaStopWordsRemoverSuite.java | 2 +- .../ml/clustering/GaussianMixtureSuite.scala | 2 +- .../evaluation/RegressionEvaluatorSuite.scala | 2 +- .../spark/ml/feature/ANOVASelectorSuite.scala | 10 +- .../apache/spark/ml/feature/DCTSuite.scala | 2 +- .../org/apache/spark/ml/feature/LSHTest.scala | 2 +- .../VarianceThresholdSelectorSuite.scala | 2 +- .../GeneralizedLinearRegressionSuite.scala | 4 +- pom.xml | 4 +- .../spark/repl/ExecutorClassLoaderSuite.scala | 5 +- 106 files changed, 288 insertions(+), 289 deletions(-) diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index 6d74f8328a..2ec1ab8861 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -274,7 +274,7 @@ Examples: - Build and push JDK11-based image for multiple archs to docker.io/myrepo $0 -r docker.io/myrepo -t v3.0.0 -X -b java_image_tag=11-jre-slim build # Note: buildx, which does cross building, needs to do the push during build - # So there is no seperate push step with -X + # So there is no separate push step with -X EOF } diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 474c453643..1fc1fb4b45 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -334,7 +334,7 @@ function preprocessGraphLayout(g, forJob) { } /* - * Helper function to size the SVG appropriately such that all 
elements are displyed. + * Helper function to size the SVG appropriately such that all elements are displayed. * This assumes that all outermost elements are clusters (rectangles). */ function resizeSvg(svg) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index 4cd83332cd..7e6dd678e2 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -74,7 +74,7 @@ function getTimeZone() { return Intl.DateTimeFormat().resolvedOptions().timeZone; } catch(ex) { // Get time zone from a string representing the date, - // eg. "Thu Nov 16 2017 01:13:32 GMT+0800 (CST)" -> "CST" + // e.g. "Thu Nov 16 2017 01:13:32 GMT+0800 (CST)" -> "CST" return new Date().toString().match(/\((.*)\)/)[1]; } } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index e445f188e1..61ab635842 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -248,7 +248,7 @@ private[spark] class ExecutorAllocationManager( executor.scheduleWithFixedDelay(scheduleTask, 0, intervalMillis, TimeUnit.MILLISECONDS) } - // copy the maps inside synchonize to ensure not being modified + // copy the maps inside synchronize to ensure not being modified val (numExecutorsTarget, numLocalityAware) = synchronized { val numTarget = numExecutorsTargetPerResourceProfileId.toMap val numLocality = numLocalityAwareTasksPerResourceProfileId.toMap @@ -379,7 +379,7 @@ private[spark] class ExecutorAllocationManager( // We lower the target number of executors but don't actively kill any yet. Killing is // controlled separately by an idle timeout. It's still helpful to reduce - // the target number in case an executor just happens to get lost (eg., bad hardware, + // the target number in case an executor just happens to get lost (e.g., bad hardware, // or the cluster manager preempts it) -- in that case, there is no point in trying // to immediately get a new executor, since we wouldn't even use it yet. decrementExecutorsFromTarget(maxNeeded, rpId, updatesNeeded) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 1bcd203f2e..6dd3630937 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -941,7 +941,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Return a RDD containing only the elements in the inclusive range `lower` to `upper`. * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be - * performed efficiently by only scanning the partitions that might containt matching elements. + * performed efficiently by only scanning the partitions that might contain matching elements. * Otherwise, a standard `filter` is applied to all partitions. * * @since 3.1.0 @@ -955,7 +955,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Return a RDD containing only the elements in the inclusive range `lower` to `upper`. * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be - * performed efficiently by only scanning the partitions that might containt matching elements. 
+ * performed efficiently by only scanning the partitions that might contain matching elements. * Otherwise, a standard `filter` is applied to all partitions. * * @since 3.1.0 diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 89b33945df..306af24ada 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -78,7 +78,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Internal method to this RDD; will read from cache if applicable, or otherwise compute it. - * This should ''not'' be called by users directly, but is available for implementors of custom + * This should ''not'' be called by users directly, but is available for implementers of custom * subclasses of RDD. */ def iterator(split: Partition, taskContext: TaskContext): JIterator[T] = diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 86a1ac31c0..6d4dc3d3df 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -48,14 +48,14 @@ import org.apache.spark.util._ private[spark] class PythonRDD( parent: RDD[_], func: PythonFunction, - preservePartitoning: Boolean, + preservePartitioning: Boolean, isFromBarrier: Boolean = false) extends RDD[Array[Byte]](parent) { override def getPartitions: Array[Partition] = firstParent.partitions override val partitioner: Option[Partitioner] = { - if (preservePartitoning) firstParent.partitioner else None + if (preservePartitioning) firstParent.partitioner else None } val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) @@ -837,7 +837,7 @@ private[spark] class PythonBroadcast(@transient var path: String) extends Serial * We might be serializing a really large object from python -- we don't want * python to buffer the whole thing in memory, nor can it write to a file, * so we don't know the length in advance. So python writes it in chunks, each chunk - * preceeded by a length, till we get a "length" of -1 which serves as EOF. + * preceded by a length, till we get a "length" of -1 which serves as EOF. * * Tested from python tests. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala index d76fb7f9a2..f697892aac 100644 --- a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala @@ -80,7 +80,7 @@ private[deploy] object JsonProtocol { } /** - * Export the [[ApplicationInfo]] to a Json objec. An [[ApplicationInfo]] consists of the + * Export the [[ApplicationInfo]] to a Json object. An [[ApplicationInfo]] consists of the * information of an application. * * @return a Json object containing the following fields: diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 7332c6d54c..4aa393c514 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -311,7 +311,7 @@ private[spark] class SparkSubmit extends Logging { // In K8s client mode, when in the driver, add resolved jars early as we might need // them at the submit time for artifact downloading. 
// For example we might use the dependencies for downloading - // files from a Hadoop Compatible fs eg. S3. In this case the user might pass: + // files from a Hadoop Compatible fs e.g. S3. In this case the user might pass: // --packages com.amazonaws:aws-java-sdk:1.7.4:org.apache.hadoop:hadoop-aws:2.7.6 if (isKubernetesClusterModeDriver) { val loader = getSubmitClassLoader(sparkConf) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e5341aff8c..e6df260bde 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -722,7 +722,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) /** * Replay the given log file, saving the application in the listing db. - * Visable for testing + * Visible for testing */ private[history] def doMergeApplicationListing( reader: EventLogFileReader, diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala index 1b8c7ff26e..4eb5c15d4e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala @@ -52,7 +52,7 @@ private[history] class HybridStore extends KVStore { // A background thread that dumps data from inMemoryStore to levelDB private var backgroundThread: Thread = null - // A hash map that stores all classes that had been writen to inMemoryStore + // A hash map that stores all classes that had been written to inMemoryStore // Visible for testing private[history] val klassMap = new ConcurrentHashMap[Class[_], Boolean] diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index efb0b2c26d..c81ac778a3 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -685,7 +685,7 @@ private[spark] class Executor( // SPARK-20904: Do not report failure to driver if if happened during shut down. Because // libraries may set up shutdown hooks that race with running tasks during shutdown, // spurious failures may occur and can result in improper accounting in the driver (e.g. - // the task failure would not be ignored if the shutdown happened because of premption, + // the task failure would not be ignored if the shutdown happened because of preemption, // instead of an app issue). if (!ShutdownHookManager.inShutdown()) { val (accums, accUpdates) = collectAccumulatorsAndResetStatusOnFailure(taskStartTimeNs) @@ -744,7 +744,7 @@ private[spark] class Executor( * sending a Thread.interrupt(), and monitoring the task until it finishes. * * Spark's current task cancellation / task killing mechanism is "best effort" because some tasks - * may not be interruptable or may not respond to their "killed" flags being set. If a significant + * may not be interruptible or may not respond to their "killed" flags being set. If a significant * fraction of a cluster's task slots are occupied by tasks that have been marked as killed but * remain running then this can lead to a situation where new jobs and tasks are starved of * resources that are being used by these zombie tasks. 
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index d98d5e3b81..bddd18adc6 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -102,7 +102,7 @@ private[spark] class MetricsConfig(conf: SparkConf) extends Logging { * * @param prop the flat list of properties to "unflatten" based on prefixes * @param regex the regex that the prefix has to comply with - * @return an unflatted map, mapping prefix with sub-properties under that prefix + * @return an unflattened map, mapping prefix with sub-properties under that prefix */ def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala index 59b863b89f..e9c2974622 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala @@ -56,7 +56,7 @@ private[spark] class PrometheusServlet( def getMetricsSnapshot(request: HttpServletRequest): String = { import scala.collection.JavaConverters._ - val guagesLabel = """{type="gauges"}""" + val gaugesLabel = """{type="gauges"}""" val countersLabel = """{type="counters"}""" val metersLabel = countersLabel val histogramslabels = """{type="histograms"}""" @@ -65,8 +65,8 @@ private[spark] class PrometheusServlet( val sb = new StringBuilder() registry.getGauges.asScala.foreach { case (k, v) => if (!v.getValue.isInstanceOf[String]) { - sb.append(s"${normalizeKey(k)}Number$guagesLabel ${v.getValue}\n") - sb.append(s"${normalizeKey(k)}Value$guagesLabel ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Number$gaugesLabel ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Value$gaugesLabel ${v.getValue}\n") } } registry.getCounters.asScala.foreach { case (k, v) => diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index 943abae17a..39f6956798 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -173,7 +173,7 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { if (buckets.length < 2) { throw new IllegalArgumentException("buckets array must have at least two elements") } - // The histogramPartition function computes the partail histogram for a given + // The histogramPartition function computes the partial histogram for a given // partition. The provided bucketFunction determines which bucket in the array // to increment or returns None if there is no bucket. 
This is done so we can // specialize for uniformly distributed buckets and save the O(log n) binary diff --git a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala index 5b1c024257..3cefcb16d6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala @@ -88,10 +88,10 @@ class OrderedRDDFunctions[K : Ordering : ClassTag, val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => - val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { + val partitionIndices = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } - PartitionPruningRDD.create(self, partitionIndicies.contains) + PartitionPruningRDD.create(self, partitionIndices.contains) case _ => self } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 15b00a4496..65b39c4b65 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -327,7 +327,7 @@ abstract class RDD[T: ClassTag]( /** * Internal method to this RDD; will read from cache if applicable, or otherwise compute it. - * This should ''not'' be called by users directly, but is available for implementors of custom + * This should ''not'' be called by users directly, but is available for implementers of custom * subclasses of RDD. */ final def iterator(split: Partition, context: TaskContext): Iterator[T] = { diff --git a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala index d3f979fa86..12ef34241f 100644 --- a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala +++ b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala @@ -20,7 +20,7 @@ package org.apache.spark.resource import org.apache.spark.annotation.{Evolving, Since} /** - * A task resource request. This is used in conjuntion with the ResourceProfile to + * A task resource request. This is used in conjunction with the ResourceProfile to * programmatically specify the resources needed for an RDD that will be applied at the * stage level. 
* diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index fcb9fe422c..5864e9e2ce 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -254,14 +254,14 @@ private[netty] class NettyRpcEnv( val timeoutCancelable = timeoutScheduler.schedule(new Runnable { override def run(): Unit = { - val remoteReceAddr = if (remoteAddr == null) { + val remoteRecAddr = if (remoteAddr == null) { Try { message.receiver.client.getChannel.remoteAddress() }.toOption.orNull } else { remoteAddr } - onFailure(new TimeoutException(s"Cannot receive any reply from ${remoteReceAddr} " + + onFailure(new TimeoutException(s"Cannot receive any reply from ${remoteRecAddr} " + s"in ${timeout.duration}")) } }, timeout.duration.toNanos, TimeUnit.NANOSECONDS) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala index 043c6b9038..8f0764ed1a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala @@ -45,10 +45,10 @@ private[spark] object BarrierJobAllocationFailed { val ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN = "[SPARK-24820][SPARK-24821]: Barrier execution mode does not allow the following pattern of " + "RDD chain within a barrier stage:\n1. Ancestor RDDs that have different number of " + - "partitions from the resulting RDD (eg. union()/coalesce()/first()/take()/" + + "partitions from the resulting RDD (e.g. union()/coalesce()/first()/take()/" + "PartitionPruningRDD). A workaround for first()/take() can be barrierRdd.collect().head " + "(scala) or barrierRdd.collect()[0] (python).\n" + - "2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2))." + "2. An RDD that depends on multiple barrier RDDs (e.g. barrierRdd1.zip(barrierRdd2))." // Error message when running a barrier stage with dynamic resource allocation enabled. val ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION = diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 6fb0fb93f2..02f5bb8ccc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -409,9 +409,9 @@ private[spark] class DAGScheduler( /** * Check to make sure we don't launch a barrier stage with unsupported RDD chain pattern. The * following patterns are not supported: - * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (eg. + * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (e.g. * union()/coalesce()/first()/take()/PartitionPruningRDD); - * 2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2)). + * 2. An RDD that depends on multiple barrier RDDs (e.g. barrierRdd1.zip(barrierRdd2)). */ private def checkBarrierStageWithRDDChainPattern(rdd: RDD[_], numTasksInStage: Int): Unit = { if (rdd.isBarrier() && @@ -459,7 +459,7 @@ private[spark] class DAGScheduler( /** * We don't support run a barrier stage with dynamic resource allocation enabled, it shall lead - * to some confusing behaviors (eg. 
with dynamic resource allocation enabled, it may happen that + * to some confusing behaviors (e.g. with dynamic resource allocation enabled, it may happen that * we acquire some executors (but not enough to launch all the tasks in a barrier stage) and * later release them due to executor idle time expire, and then acquire again). * @@ -1555,7 +1555,7 @@ private[spark] class DAGScheduler( event.reason) if (!stageIdToStage.contains(task.stageId)) { - // The stage may have already finished when we get this event -- eg. maybe it was a + // The stage may have already finished when we get this event -- e.g. maybe it was a // speculative task. It is important that we send the TaskEnd event in any case, so listeners // are properly notified and can chose to handle it. For instance, some listeners are // doing their own accounting and if they don't get the task end event they think diff --git a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala index 9bbacea94b..c6b8dca359 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala @@ -32,7 +32,7 @@ import org.apache.spark.util.{Clock, SystemClock, Utils} * additional logic for exclusion of executors and nodes for individual tasks and stages which * works in concert with the logic here. * - * The tracker needs to deal with a variety of workloads, eg.: + * The tracker needs to deal with a variety of workloads, e.g.: * * * bad user code -- this may lead to many task failures, but that should not count against * individual executors @@ -362,7 +362,7 @@ private[scheduler] class HealthTracker ( * Apply the timeout to individual tasks. This is to prevent one-off failures that are very * spread out in time (and likely have nothing to do with problems on the executor) from * triggering exlusion. However, note that we do *not* remove executors and nodes from - * being excluded as we expire individual task failures -- each have their own timeout. Eg., + * being excluded as we expire individual task failures -- each have their own timeout. E.g., * suppose: * * timeout = 10, maxFailuresPerExec = 2 * * Task 1 fails on exec 1 at time 0 diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 0cfa76583b..914fccc1a6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -216,7 +216,7 @@ private[spark] class TaskSetManager( /** * Track the set of locality levels which are valid given the tasks locality preferences and * the set of currently available executors. This is updated as executors are added and removed. - * This allows a performance optimization, of skipping levels that aren't relevant (eg., skip + * This allows a performance optimization, of skipping levels that aren't relevant (e.g., skip * PROCESS_LOCAL if no tasks could be run PROCESS_LOCAL for the current set of executors). 
*/ private[scheduler] var myLocalityLevels = computeValidLocalityLevels() diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index a4df0d543e..4ebb7b0def 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -167,7 +167,7 @@ private[spark] object CryptoStreamUtils extends Logging { } /** - * SPARK-25535. The commons-cryto library will throw InternalError if something goes + * SPARK-25535. The commons-crypto library will throw InternalError if something goes * wrong, and leave bad state behind in the Java wrappers, so it's not safe to use them * afterwards. This wrapper detects that situation and avoids further calls into the * commons-crypto code, while still allowing the underlying streams to be closed. diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 924601f92c..072702b343 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -1103,7 +1103,7 @@ private[spark] class BlockManager( blockSize: Long): Option[ManagedBuffer] = { val file = ExecutorDiskUtils.getFile(localDirs, subDirsPerLocalDir, blockId.name) if (file.exists()) { - val mangedBuffer = securityManager.getIOEncryptionKey() match { + val managedBuffer = securityManager.getIOEncryptionKey() match { case Some(key) => // Encrypted blocks cannot be memory mapped; return a special object that does decryption // and provides InputStream / FileRegion implementations for reading the data. @@ -1114,7 +1114,7 @@ private[spark] class BlockManager( val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle") new FileSegmentManagedBuffer(transportConf, file, 0, file.length) } - Some(mangedBuffer) + Some(managedBuffer) } else { None } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 4d56551170..eada4b3ee2 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -357,7 +357,7 @@ class BlockManagerMasterEndpoint( blockLocations.remove(blockId) logWarning(s"No more replicas available for $blockId !") } else if (proactivelyReplicate && (blockId.isRDD || blockId.isInstanceOf[TestBlockId])) { - // As a heursitic, assume single executor failure to find out the number of replicas that + // As a heuristic, assume single executor failure to find out the number of replicas that // existed before failure val maxReplicas = locations.size + 1 val i = (new Random(blockId.hashCode)).nextInt(locations.size) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 5f5a08fe0e..cfe15eb832 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -85,7 +85,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We } // The timeline library treats contents as HTML, so we have to escape them. We need to add - // extra layers of escaping in order to embed this in a Javascript string literal. 
+ // extra layers of escaping in order to embed this in a JavaScript string literal. val escapedDesc = Utility.escape(jobDescription) val jsEscapedDescForTooltip = StringEscapeUtils.escapeEcmaScript(Utility.escape(escapedDesc)) val jsEscapedDescForLabel = StringEscapeUtils.escapeEcmaScript(escapedDesc) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 19eccc5209..c40e1bc248 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -68,7 +68,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP .getOrElse(System.currentTimeMillis()) // The timeline library treats contents as HTML, so we have to escape them. We need to add - // extra layers of escaping in order to embed this in a Javascript string literal. + // extra layers of escaping in order to embed this in a JavaScript string literal. val escapedName = Utility.escape(name) val jsEscapedNameForTooltip = StringEscapeUtils.escapeEcmaScript(Utility.escape(escapedName)) val jsEscapedNameForLabel = StringEscapeUtils.escapeEcmaScript(escapedName) diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index 6ffd6605f7..7e2b9c72ad 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -285,7 +285,7 @@ private[spark] object ClosureCleaner extends Logging { logDebug(s" + outermost object is a closure, so we clone it: ${outermostClass}") } else if (outermostClass.getName.startsWith("$line")) { // SPARK-14558: if the outermost object is a REPL line object, we should clone - // and clean it as it may carray a lot of unnecessary information, + // and clean it as it may carry a lot of unnecessary information, // e.g. hadoop conf, spark conf, etc. logDebug(s" + outermost object is a REPL line object, so we clone it:" + s" ${outermostClass}") diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 71a310a427..accf3d7c0d 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -376,7 +376,7 @@ private[spark] object Utils extends Logging { * This returns a new InputStream which contains the same data as the original input stream. * It may be entirely on in-memory buffer, or it may be a combination of in-memory data, and then * continue to read from the original stream. The only real use of this is if the original input - * stream will potentially detect corruption while the data is being read (eg. from compression). + * stream will potentially detect corruption while the data is being read (e.g. from compression). * This allows for an eager check of corruption in the first maxSize bytes of data. 
* * @return An InputStream which includes all data from the original stream (combining buffered @@ -1067,20 +1067,20 @@ private[spark] object Utils extends Logging { } // checks if the hostport contains IPV6 ip and parses the host, port if (hostPort != null && hostPort.split(":").length > 2) { - val indx: Int = hostPort.lastIndexOf("]:") - if (-1 == indx) { + val index: Int = hostPort.lastIndexOf("]:") + if (-1 == index) { return setDefaultPortValue } - val port = hostPort.substring(indx + 2).trim() - val retval = (hostPort.substring(0, indx + 1).trim(), if (port.isEmpty) 0 else port.toInt) + val port = hostPort.substring(index + 2).trim() + val retval = (hostPort.substring(0, index + 1).trim(), if (port.isEmpty) 0 else port.toInt) hostPortParseResults.putIfAbsent(hostPort, retval) } else { - val indx: Int = hostPort.lastIndexOf(':') - if (-1 == indx) { + val index: Int = hostPort.lastIndexOf(':') + if (-1 == index) { return setDefaultPortValue } - val port = hostPort.substring(indx + 1).trim() - val retval = (hostPort.substring(0, indx).trim(), if (port.isEmpty) 0 else port.toInt) + val port = hostPort.substring(index + 1).trim() + val retval = (hostPort.substring(0, index).trim(), if (port.isEmpty) 0 else port.toInt) hostPortParseResults.putIfAbsent(hostPort, retval) } @@ -2854,11 +2854,11 @@ private[spark] object Utils extends Logging { if (lastDollarIndex < s.length - 1) { // The last char is not a dollar sign if (lastDollarIndex == -1 || !s.contains("$iw")) { - // The name does not have dollar sign or is not an intepreter + // The name does not have dollar sign or is not an interpreter // generated class, so we should return the full string s } else { - // The class name is intepreter generated, + // The class name is interpreter generated, // return the part after the last dollar sign // This is the same behavior as getClass.getSimpleName s.substring(lastDollarIndex + 1) diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index 2c3730de08..8635f1a3d7 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -193,7 +193,7 @@ private[spark] object ChunkedByteBuffer { length: Long): ChunkedByteBuffer = { // We do *not* memory map the file, because we may end up putting this into the memory store, // and spark currently is not expecting memory-mapped buffers in the memory store, it conflicts - // with other parts that manage the lifecyle of buffers and dispose them. See SPARK-25422. + // with other parts that manage the lifecycle of buffers and dispose them. See SPARK-25422. 
val is = new FileInputStream(file) ByteStreams.skipFully(is, offset) val in = new LimitedInputStream(is, length) diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index ee8e38c24b..df1d306e62 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -68,10 +68,10 @@ import static org.mockito.Mockito.*; public class UnsafeShuffleWriterSuite { static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096; - static final int NUM_PARTITITONS = 4; + static final int NUM_PARTITIONS = 4; TestMemoryManager memoryManager; TaskMemoryManager taskMemoryManager; - final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITITONS); + final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITIONS); File mergedOutputFile; File tempDir; long[] partitionSizesInMergedFile; @@ -194,7 +194,7 @@ public class UnsafeShuffleWriterSuite { private List> readRecordsFromFile() throws IOException { final ArrayList> recordsList = new ArrayList<>(); long startOffset = 0; - for (int i = 0; i < NUM_PARTITITONS; i++) { + for (int i = 0; i < NUM_PARTITIONS; i++) { final long partitionSize = partitionSizesInMergedFile[i]; if (partitionSize > 0) { FileInputStream fin = new FileInputStream(mergedOutputFile); @@ -253,7 +253,7 @@ public class UnsafeShuffleWriterSuite { assertTrue(mapStatus.isDefined()); assertTrue(mergedOutputFile.exists()); assertEquals(0, spillFilesCreated.size()); - assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile); + assertArrayEquals(new long[NUM_PARTITIONS], partitionSizesInMergedFile); assertEquals(0, taskMetrics.shuffleWriteMetrics().recordsWritten()); assertEquals(0, taskMetrics.shuffleWriteMetrics().bytesWritten()); assertEquals(0, taskMetrics.diskBytesSpilled()); @@ -264,7 +264,7 @@ public class UnsafeShuffleWriterSuite { public void writeWithoutSpilling() throws Exception { // In this example, each partition should have exactly one record: final ArrayList> dataToWrite = new ArrayList<>(); - for (int i = 0; i < NUM_PARTITITONS; i++) { + for (int i = 0; i < NUM_PARTITIONS; i++) { dataToWrite.add(new Tuple2<>(i, i)); } final UnsafeShuffleWriter writer = createWriter(true); diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index dbaca71c5f..e73ac0e9fb 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -1518,7 +1518,7 @@ public class JavaAPISuite implements Serializable { JavaFutureAction future = rdd.map(new BuggyMapFunction<>()).countAsync(); try { future.get(2, TimeUnit.SECONDS); - fail("Expected future.get() for failed job to throw ExcecutionException"); + fail("Expected future.get() for failed job to throw ExecutionException"); } catch (ExecutionException ee) { assertTrue(Throwables.getStackTraceAsString(ee).contains("Custom exception!")); } diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 21090e98ea..e42df08215 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -635,12 +635,12 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { // Verify that RDD is 
checkpointed assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]]) val checkpointedRDD = rdd.firstParent.asInstanceOf[ReliableCheckpointRDD[_]] - val partiton = checkpointedRDD.partitions(0) - assert(!checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) + val partition = checkpointedRDD.partitions(0) + assert(!checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partition)) - val preferredLoc = checkpointedRDD.preferredLocations(partiton) - assert(checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) - assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partiton)) + val preferredLoc = checkpointedRDD.preferredLocations(partition) + assert(checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partition)) + assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partition)) } } @@ -653,7 +653,7 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { val rdd = sc.makeRDD(1 to 200, numSlices = 4).repartition(1).mapPartitions { iter => iter.map { i => if (i > 100 && TaskContext.get().stageAttemptNumber() == 0) { - // throw new SparkException("Make first attemp failed.") + // throw new SparkException("Make first attempt failed.") // Throw FetchFailedException to explicitly trigger stage resubmission. // A normal exception will only trigger task resubmission in the same stage. throw new FetchFailedException(null, 0, 0L, 0, 0, "Fake") diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala index 81530a8fda..5434e82c95 100644 --- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala @@ -368,7 +368,7 @@ class CleanerTester( val toBeCleanedRDDIds = new HashSet[Int] ++= rddIds val toBeCleanedShuffleIds = new HashSet[Int] ++= shuffleIds - val toBeCleanedBroadcstIds = new HashSet[Long] ++= broadcastIds + val toBeCleanedBroadcastIds = new HashSet[Long] ++= broadcastIds val toBeCheckpointIds = new HashSet[Long] ++= checkpointIds val isDistributed = !sc.isLocal @@ -384,7 +384,7 @@ class CleanerTester( } def broadcastCleaned(broadcastId: Long): Unit = { - toBeCleanedBroadcstIds.synchronized { toBeCleanedBroadcstIds -= broadcastId } + toBeCleanedBroadcastIds.synchronized { toBeCleanedBroadcastIds -= broadcastId } logInfo("Broadcast " + broadcastId + " cleaned") } @@ -508,8 +508,8 @@ class CleanerTester( val s2 = toBeCleanedShuffleIds.synchronized { toBeCleanedShuffleIds.toSeq.sorted.mkString("[", ", ", "]") } - val s3 = toBeCleanedBroadcstIds.synchronized { - toBeCleanedBroadcstIds.toSeq.sorted.mkString("[", ", ", "]") + val s3 = toBeCleanedBroadcastIds.synchronized { + toBeCleanedBroadcastIds.toSeq.sorted.mkString("[", ", ", "]") } s""" |\tRDDs = $s1 @@ -521,7 +521,7 @@ class CleanerTester( private def isAllCleanedUp = toBeCleanedRDDIds.synchronized { toBeCleanedRDDIds.isEmpty } && toBeCleanedShuffleIds.synchronized { toBeCleanedShuffleIds.isEmpty } && - toBeCleanedBroadcstIds.synchronized { toBeCleanedBroadcstIds.isEmpty } && + toBeCleanedBroadcastIds.synchronized { toBeCleanedBroadcastIds.isEmpty } && toBeCheckpointIds.synchronized { toBeCheckpointIds.isEmpty } private def getRDDBlocks(rddId: Int): Seq[BlockId] = { diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index d1edb80e40..c1269a9c91 100644 --- 
a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -268,7 +268,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("add executors multiple profiles initial num same as needed") { // test when the initial number of executors equals the number needed for the first - // stage using a non default profile to make sure we request the intitial number + // stage using a non default profile to make sure we request the initial number // properly. Here initial is 2, each executor in ResourceProfile 1 can have 2 tasks // per executor, and start a stage with 4 tasks, which would need 2 executors. val clock = new ManualClock(8888L) diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index e9ee6b5dfb..f953bf4043 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -170,7 +170,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { val nums = sc.makeRDD(1 to 3).map(x => (x, "a" * x)) // (1,a), (2,aa), (3,aaa) nums.saveAsSequenceFile(outputDir) // Similar to the tests above, we read a SequenceFile, but this time we pass type params - // that are convertable to Writable instead of calling sequenceFile[IntWritable, Text] + // that are convertible to Writable instead of calling sequenceFile[IntWritable, Text] val output1 = sc.sequenceFile[Int, String](outputDir) assert(output1.collect().toList === List((1, "a"), (2, "aa"), (3, "aaa"))) // Also try having one type be a subclass of Writable and one not diff --git a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala index e97b9d5d6b..eff4fd20d7 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala @@ -21,7 +21,7 @@ import java.io.{File, FileOutputStream, OutputStream} /** * A base class for generate benchmark results to a file. - * For JDK9+, JDK major version number is added to the file names to distingush the results. + * For JDK9+, JDK major version number is added to the file names to distinguish the results. 
*/ abstract class BenchmarkBase { var output: Option[OutputStream] = None diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 0b0754be2f..3b8677742c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -926,8 +926,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { oldProvider.listing.setMetadata(meta) oldProvider.stop() - val mistatchedVersionProvider = new FsHistoryProvider(conf) - assert(mistatchedVersionProvider.listing.count(classOf[ApplicationInfoWrapper]) === 0) + val mismatchedVersionProvider = new FsHistoryProvider(conf) + assert(mismatchedVersionProvider.listing.count(classOf[ApplicationInfoWrapper]) === 0) } test("invalidate cached UI") { diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index a46799df06..b1b97a61ed 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -784,7 +784,7 @@ class MasterSuite extends SparkFunSuite var worker: MockExecutorLaunchFailWorker = null try { val conf = new SparkConf() - // SPARK-32250: When running test on Github Action machine, the available processors in JVM + // SPARK-32250: When running test on GitHub Action machine, the available processors in JVM // is only 2, while on Jenkins it's 32. For this specific test, 2 available processors, which // also decides number of threads in Dispatcher, is not enough to consume the messages. In // the worst situation, MockExecutorLaunchFailWorker would occupy these 2 threads for diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala index 5bbd60f99f..8ed861ad34 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala @@ -342,7 +342,7 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { testWorkDirCleanupAndRemoveMetadataWithConfig(true) } - test("WorkdDirCleanup cleans only app dirs when" + + test("WorkDirCleanup cleans only app dirs when" + "spark.shuffle.service.db.enabled=false") { testWorkDirCleanupAndRemoveMetadataWithConfig(false) } diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 1326ae3c11..5b868604ec 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -552,7 +552,7 @@ class ExecutorSuite extends SparkFunSuite if (poll) { executor.metricsPoller.poll() } - executor.killAllTasks(true, "Killed task, eg. because of speculative execution") + executor.killAllTasks(true, "Killed task, e.g. 
because of speculative execution") } else { timedOut.set(true) } diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 2bd32fc927..778f748f83 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -75,7 +75,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { /** * Create a classic two-arg protocol instance. - * @param dynamic dyanmic partitioning mode + * @param dynamic dynamic partitioning mode * @return the instance */ private def instantiateClassic(dynamic: Boolean): ClassicConstructorCommitProtocol = { @@ -88,7 +88,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { /** * Create a three-arg protocol instance. - * @param dynamic dyanmic partitioning mode + * @param dynamic dynamic partitioning mode * @return the instance */ private def instantiateNew( diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index 330347299a..905bb81107 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -213,7 +213,7 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext } // Computing the amount of bytes read for a cartesian operation is a little involved. - // Cartesian interleaves reads between two partitions eg. p1 and p2. + // Cartesian interleaves reads between two partitions e.g. p1 and p2. 
// Here are the steps: // 1) First it creates an iterator for p1 // 2) Creates an iterator for p2 diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala index 182c3c09e0..c8a8f37212 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala @@ -88,7 +88,7 @@ class NettyBlockTransferServiceSuite } test("SPARK-27637: test fetch block with executor dead") { - implicit val exectionContext = ExecutionContext.global + implicit val executionContext = ExecutionContext.global val port = 17634 + Random.nextInt(10000) logInfo("random port for test: " + port) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 2de4b109e4..a669993352 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{Job => NewJob, JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, - RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} + RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttemptContext} import org.apache.hadoop.util.Progressable import org.scalatest.Assertions @@ -892,7 +892,7 @@ class FakeOutputFormat() extends OutputFormat[Integer, Integer]() { */ class NewFakeWriter extends NewRecordWriter[Integer, Integer] { - def close(p1: NewTaskAttempContext): Unit = () + def close(p1: NewTaskAttemptContext): Unit = () def write(p1: Integer, p2: Integer): Unit = () @@ -901,24 +901,24 @@ class NewFakeWriter extends NewRecordWriter[Integer, Integer] { class NewFakeCommitter extends NewOutputCommitter { def setupJob(p1: NewJobContext): Unit = () - def needsTaskCommit(p1: NewTaskAttempContext): Boolean = false + def needsTaskCommit(p1: NewTaskAttemptContext): Boolean = false - def setupTask(p1: NewTaskAttempContext): Unit = () + def setupTask(p1: NewTaskAttemptContext): Unit = () - def commitTask(p1: NewTaskAttempContext): Unit = () + def commitTask(p1: NewTaskAttemptContext): Unit = () - def abortTask(p1: NewTaskAttempContext): Unit = () + def abortTask(p1: NewTaskAttemptContext): Unit = () } class NewFakeFormat() extends NewOutputFormat[Integer, Integer]() { def checkOutputSpecs(p1: NewJobContext): Unit = () - def getRecordWriter(p1: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriter() } - def getOutputCommitter(p1: NewTaskAttempContext): NewOutputCommitter = { + def getOutputCommitter(p1: NewTaskAttemptContext): NewOutputCommitter = { new NewFakeCommitter() } } @@ -958,7 +958,7 @@ class FakeFormatWithCallback() extends FakeOutputFormat { } class NewFakeWriterWithCallback extends NewFakeWriter { - override def close(p1: NewTaskAttempContext): Unit = { + override def close(p1: NewTaskAttemptContext): Unit = { FakeWriterWithCallback.calledBy += "close" } @@ -972,7 +972,7 @@ class NewFakeWriterWithCallback extends NewFakeWriter { } class NewFakeFormatWithCallback() extends NewFakeFormat { - override def getRecordWriter(p1: 
NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + override def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriterWithCallback() } } @@ -982,27 +982,27 @@ class YetAnotherFakeCommitter extends NewOutputCommitter with Assertions { JobID.jobid = j.getJobID().getId } - def needsTaskCommit(t: NewTaskAttempContext): Boolean = false + def needsTaskCommit(t: NewTaskAttemptContext): Boolean = false - def setupTask(t: NewTaskAttempContext): Unit = { + def setupTask(t: NewTaskAttemptContext): Unit = { val jobId = t.getTaskAttemptID().getJobID().getId assert(jobId === JobID.jobid) } - def commitTask(t: NewTaskAttempContext): Unit = {} + def commitTask(t: NewTaskAttemptContext): Unit = {} - def abortTask(t: NewTaskAttempContext): Unit = {} + def abortTask(t: NewTaskAttemptContext): Unit = {} } class YetAnotherFakeFormat() extends NewOutputFormat[Integer, Integer]() { def checkOutputSpecs(j: NewJobContext): Unit = {} - def getRecordWriter(t: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + def getRecordWriter(t: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriter() } - def getOutputCommitter(t: NewTaskAttempContext): NewOutputCommitter = { + def getOutputCommitter(t: NewTaskAttemptContext): NewOutputCommitter = { new YetAnotherFakeCommitter() } } @@ -1021,7 +1021,7 @@ class ConfigTestFormat() extends NewFakeFormat() with Configurable { def getConf: Configuration = null - override def getRecordWriter(p1: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + override def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { assert(setConfCalled, "setConf was never called") super.getRecordWriter(p1) } diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 8962fd6740..df8ac2ef74 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -1102,7 +1102,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { } } - test("RDD.partitions() fails fast when partitions indicies are incorrect (SPARK-13021)") { + test("RDD.partitions() fails fast when partitions indices are incorrect (SPARK-13021)") { class BadRDD[T: ClassTag](prev: RDD[T]) extends RDD[T](prev) { override def compute(part: Partition, context: TaskContext): Iterator[T] = { diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index e8e8682e20..eac45e6ac5 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -221,7 +221,7 @@ class ResourceUtilsSuite extends SparkFunSuite val conf = new SparkConf assume(!(Utils.isWindows)) withTempDir { dir => - val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDisocveryScript", + val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", """{"name": "gpu", "addresses": ["0", "1"]}""") conf.set(DRIVER_GPU_ID.amountConf, "2") conf.set(DRIVER_GPU_ID.discoveryScriptConf, gpuDiscovery) diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala index c2730f90ed..fe6d0db837 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala @@ -73,7 +73,7 @@ class NettyRpcEnvSuite extends RpcEnvSuite with MockitoSugar with TimeLimits { val nettyEnv = env.asInstanceOf[NettyRpcEnv] val client = mock[TransportClient] - val senderAddress = RpcAddress("locahost", 12345) + val senderAddress = RpcAddress("localhost", 12345) val receiverAddress = RpcEndpointAddress("localhost", 54321, "test") val receiver = new NettyRpcEndpointRef(nettyEnv.conf, receiverAddress, nettyEnv) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 58aa246b73..194e0dfe31 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2569,7 +2569,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti val newTaskSet = taskSets(1) // 2 tasks should have been re-submitted, for tasks 0 and 1 (which ran on hostA). assert(newTaskSet.tasks.size === 2) - // Complete task 0 from the original task set (i.e., not hte one that's currently active). + // Complete task 0 from the original task set (i.e., not the one that's currently active). // This should still be counted towards the job being complete (but there's still one // outstanding task). runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2))) @@ -3057,7 +3057,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti assertResultStageFailToRollback(shuffleMapRdd) } - private def assertResultStageNotRollbacked(mapRdd: MyRDD): Unit = { + private def assertResultStageNotRolledBack(mapRdd: MyRDD): Unit = { val shuffleDep = new ShuffleDependency(mapRdd, new HashPartitioner(2)) val shuffleId = shuffleDep.shuffleId val finalRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) @@ -3097,7 +3097,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti val shuffleMapRdd = new MyCheckpointRDD(sc, 2, Nil, indeterminate = true) shuffleMapRdd.checkpoint() shuffleMapRdd.doCheckpoint() - assertResultStageNotRollbacked(shuffleMapRdd) + assertResultStageNotRolledBack(shuffleMapRdd) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index e6fbf9b09d..cb50c7c959 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -255,7 +255,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp /* * This is a dummy input stream that wraps another input stream but ends prematurely when - * reading at the specified position, throwing an EOFExeption. + * reading at the specified position, throwing an EOFException. 
*/ private class EarlyEOFInputStream(in: InputStream, failAtPos: Int) extends InputStream { private val countDown = new AtomicInteger(failAtPos) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 0874163b0e..88d2868b95 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -44,7 +44,7 @@ import org.apache.spark.util.{CallSite, ThreadUtils, Utils} * TaskSetManagers. * * Test cases are configured by providing a set of jobs to submit, and then simulating interaction - * with spark's executors via a mocked backend (eg., task completion, task failure, executors + * with spark's executors via a mocked backend (e.g., task completion, task failure, executors * disconnecting, etc.). */ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite @@ -372,7 +372,7 @@ private[spark] abstract class MockBackend( /** * Accessed by both scheduling and backend thread, so should be protected by this. - * Most likely the only thing that needs to be protected are the inidividual ExecutorTaskStatus, + * Most likely the only thing that needs to be protected are the individual ExecutorTaskStatus, * but for simplicity in this mock just lock the whole backend. */ def executorIdToExecutor: Map[String, ExecutorTaskStatus] @@ -535,8 +535,8 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor */ testScheduler("super simple job") { def runBackend(): Unit = { - val (taskDescripition, _) = backend.beginTask() - backend.taskSuccess(taskDescripition, 42) + val (taskDescription, _) = backend.beginTask() + backend.taskSuccess(taskDescription, 42) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index a4a84b0e89..d72744c5cc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -571,9 +571,9 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match } } - test("event queue size can be configued through spark conf") { + test("event queue size can be configured through spark conf") { // configure the shared queue size to be 1, event log queue size to be 2, - // and listner bus event queue size to be 5 + // and listener bus event queue size to be 5 val conf = new SparkConf(false) .set(LISTENER_BUS_EVENT_QUEUE_CAPACITY, 5) .set(s"spark.scheduler.listenerbus.eventqueue.${SHARED_QUEUE}.capacity", "1") @@ -593,7 +593,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // check the size of shared queue is 1 as configured assert(bus.getQueueCapacity(SHARED_QUEUE) == Some(1)) // no specific size of status queue is configured, - // it shoud use the LISTENER_BUS_EVENT_QUEUE_CAPACITY + // it should use the LISTENER_BUS_EVENT_QUEUE_CAPACITY assert(bus.getQueueCapacity(APP_STATUS_QUEUE) == Some(5)) // check the size of event log queue is 5 as configured assert(bus.getQueueCapacity(EVENT_LOG_QUEUE) == Some(2)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index a760dda389..3bf6cc226c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -377,8 +377,8 @@ class TaskSetManagerSuite // offers not accepted due to task set zombies are not delay schedule rejects manager.isZombie = true - val (taskDesciption, delayReject) = manager.resourceOffer("exec2", "host2", ANY) - assert(taskDesciption.isEmpty) + val (taskDescription, delayReject) = manager.resourceOffer("exec2", "host2", ANY) + assert(taskDescription.isEmpty) assert(delayReject === false) manager.isZombie = false @@ -1322,7 +1322,7 @@ class TaskSetManagerSuite test("SPARK-19868: DagScheduler only notified of taskEnd when state is ready") { // dagScheduler.taskEnded() is async, so it may *seem* ok to call it before we've set all - // appropriate state, eg. isZombie. However, this sets up a race that could go the wrong way. + // appropriate state, e.g. isZombie. However, this sets up a race that could go the wrong way. // This is a super-focused regression test which checks the zombie state as soon as // dagScheduler.taskEnded() is called, to ensure we haven't introduced a race. sc = new SparkContext("local", "test") diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 6ca1109791..a251c164a7 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -234,7 +234,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Send two executor metrics update. Only update one metric to avoid a lot of boilerplate code. // The tasks are distributed among the two executors, so the executor-level metrics should - // hold half of the cummulative value of the metric being updated. + // hold half of the cumulative value of the metric being updated. 
Seq(1L, 2L).foreach { value => s1Tasks.foreach { task => val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 144489c5f7..44b6f1b82e 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -1712,12 +1712,12 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val externalShuffleServicePort = StorageUtils.externalShuffleServicePort(conf) val port = store.blockTransferService.port val rack = Some("rack") - val blockManagerWithTopolgyInfo = BlockManagerId( + val blockManagerWithTopologyInfo = BlockManagerId( store.blockManagerId.executorId, store.blockManagerId.host, store.blockManagerId.port, rack) - store.blockManagerId = blockManagerWithTopolgyInfo + store.blockManagerId = blockManagerWithTopologyInfo val locations = Seq( BlockManagerId("executor4", otherHost, externalShuffleServicePort, rack), BlockManagerId("executor3", otherHost, port, rack), diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 4cd1fc19f1..7640c17166 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -506,9 +506,9 @@ class JsonProtocolSuite extends SparkFunSuite { val oldExecutorMetricsJson = JsonProtocol.executorMetricsToJson(executorMetrics) .removeField( _._1 == "MappedPoolMemory") - val exepectedExecutorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, + val expectedExecutorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 0L, 40L, 20L, 20L, 10L, 20L, 10L)) - assertEquals(exepectedExecutorMetrics, + assertEquals(expectedExecutorMetrics, JsonProtocol.executorMetricsFromJson(oldExecutorMetricsJson)) } @@ -978,8 +978,8 @@ private[spark] object JsonProtocolSuite extends Assertions { private val stackTrace = { Array[StackTraceElement]( new StackTraceElement("Apollo", "Venus", "Mercury", 42), - new StackTraceElement("Afollo", "Vemus", "Mercurry", 420), - new StackTraceElement("Ayollo", "Vesus", "Blackberry", 4200) + new StackTraceElement("Afollo", "Vemus", "Mercurry", 420), /* odd spellings intentional */ + new StackTraceElement("Ayollo", "Vesus", "Blackberry", 4200) /* odd spellings intentional */ ) } diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index 6183ba9faa..d669f2c655 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -94,7 +94,7 @@ class SizeEstimatorSuite override def beforeEach(): Unit = { super.beforeEach() // Set the arch to 64-bit and compressedOops to true so that SizeEstimator - // provides identical results accross all systems in these tests. + // provides identical results across all systems in these tests. 
reinitializeSizeEstimator("amd64", "true") } diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 6b4b1c652a..7d0e787380 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -66,10 +66,10 @@ module Jekyll rendered_code + hint end - # Trim the code block so as to have the same indention, regardless of their positions in the + # Trim the code block so as to have the same indentation, regardless of their positions in the # code file. def trim_codeblock(lines) - # Select the minimum indention of the current code block. + # Select the minimum indentation of the current code block. min_start_spaces = lines .select { |l| l.strip.size !=0 } .map { |l| l[/\A */].size } diff --git a/docs/building-spark.md b/docs/building-spark.md index 73c527b7a5..5106f2abd4 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -273,7 +273,7 @@ Enable the profile (e.g. 2.13): # For sbt ./build/sbt -Pscala-2.13 compile -## Running Jenkins tests with Github Enterprise +## Running Jenkins tests with GitHub Enterprise To run tests with Jenkins: diff --git a/docs/configuration.md b/docs/configuration.md index 14ff38dac9..76494b04c9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2170,7 +2170,7 @@ Apart from these, the following properties are also available, and may be useful 120s The timeout in seconds to wait to acquire a new executor and schedule a task before aborting a - TaskSet which is unschedulable because all executors are exluded due to task failures. + TaskSet which is unschedulable because all executors are excluded due to task failures. 2.4.1 diff --git a/docs/css/main.css b/docs/css/main.css index 8b279a157c..271113c904 100755 --- a/docs/css/main.css +++ b/docs/css/main.css @@ -254,7 +254,7 @@ a:hover code { position: relative; background-color: #FFF; max-width: 914px; - line-height: 1.6; /* Inspired by Github's wiki style */ + line-height: 1.6; /* Inspired by GitHub's wiki style */ padding-left: 15px; } @@ -263,7 +263,7 @@ a:hover code { position: relative; background-color: #FFF; max-width: 914px; - line-height: 1.6; /* Inspired by Github's wiki style */ + line-height: 1.6; /* Inspired by GitHub's wiki style */ padding-left: 30px; min-height: 100vh; } diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 50c9366a09..a1026669dc 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -571,7 +571,7 @@ messages to the source and destination attributes. Think of `sendMsg` as the reduce function in map-reduce. -The [`aggregateMessages`][Graph.aggregateMessages] operator returns an `VertexRDD[Msg]` +The [`aggregateMessages`][Graph.aggregateMessages] operator returns a `VertexRDD[Msg]` containing the aggregate message (of type `Msg`) destined to each vertex. Vertices that did not receive a message are not included in the returned `VertexRDD`[VertexRDD]. @@ -874,7 +874,7 @@ change the `VertexId` thereby enabling the same `HashMap` data structures to be `HashMap` and implement the join by linear scan rather than costly point lookups. The `aggregateUsingIndex` operator is useful for efficient construction of a new `VertexRDD`[VertexRDD] from an -`RDD[(VertexId, A)]`. Conceptually, if I have constructed an `VertexRDD[B]` over a set of vertices, +`RDD[(VertexId, A)]`.
Conceptually, if I have constructed a `VertexRDD[B]` over a set of vertices, *which is a super-set* of the vertices in some `RDD[(VertexId, A)]` then I can reuse the index to both aggregate and then subsequently index the `RDD[(VertexId, A)]`. For example: diff --git a/docs/ml-migration-guide.md b/docs/ml-migration-guide.md index 4e6d68f5a8..43b8de83a9 100644 --- a/docs/ml-migration-guide.md +++ b/docs/ml-migration-guide.md @@ -281,7 +281,7 @@ Several deprecated methods were removed in the `spark.mllib` and `spark.ml` pack * `weights` in `LinearRegression` and `LogisticRegression` in `spark.ml` * `setMaxNumIterations` in `mllib.optimization.LBFGS` (marked as `DeveloperApi`) * `treeReduce` and `treeAggregate` in `mllib.rdd.RDDFunctions` (these functions are available on `RDD`s directly, and were marked as `DeveloperApi`) -* `defaultStategy` in `mllib.tree.configuration.Strategy` +* `defaultStrategy` in `mllib.tree.configuration.Strategy` * `build` in `mllib.tree.Node` * libsvm loaders for multiclass and load/save labeledData methods in `mllib.util.MLUtils` diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 4cb2e259cc..cc0c0e39e6 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -189,7 +189,7 @@ Refer to the [`PowerIterationClustering` Scala docs](api/scala/org/apache/spark/ [`PowerIterationClustering`](api/java/org/apache/spark/mllib/clustering/PowerIterationClustering.html) implements the PIC algorithm. -It takes an `JavaRDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the +It takes a `JavaRDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the affinity matrix. Calling `PowerIterationClustering.run` returns a [`PowerIterationClusteringModel`](api/java/org/apache/spark/mllib/clustering/PowerIterationClusteringModel.html) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 6d3b1a599d..ce4e6b8e05 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -643,7 +643,7 @@ entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), Matrix # - or using (long, long, float) tuples: entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) -# Create an CoordinateMatrix from an RDD of MatrixEntries. +# Create a CoordinateMatrix from an RDD of MatrixEntries. mat = CoordinateMatrix(entries) # Get its size. diff --git a/docs/monitoring.md b/docs/monitoring.md index 15a6cbd910..c6105188f0 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -421,7 +421,7 @@ to handle the Spark Context setup and tear down. In addition to viewing the metrics in the UI, they are also available as JSON. This gives developers an easy way to create new visualizations and monitoring tools for Spark. The JSON is available for -both running applications, and in the history server. The endpoints are mounted at `/api/v1`. Eg., +both running applications, and in the history server. The endpoints are mounted at `/api/v1`. For example, for the history server, they would typically be accessible at `http://:18080/api/v1`, and for a running application, at `http://localhost:4040/api/v1`. @@ -951,11 +951,11 @@ These endpoints have been strongly versioned to make it easier to develop applic * Individual fields will never be removed for any given endpoint * New endpoints may be added * New fields may be added to existing endpoints -* New versions of the api may be added in the future as a separate endpoint (eg., `api/v2`). New versions are *not* required to be backwards compatible.
+* New versions of the api may be added in the future as a separate endpoint (e.g., `api/v2`). New versions are *not* required to be backwards compatible. * Api versions may be dropped, but only after at least one minor release of co-existing with a new api version. Note that even when examining the UI of running applications, the `applications/[app-id]` portion is -still required, though there is only one application available. Eg. to see the list of jobs for the +still required, though there is only one application available. E.g. to see the list of jobs for the running app, you would go to `http://localhost:4040/api/v1/applications/[app-id]/jobs`. This is to keep the paths consistent in both modes. diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 5ec7a2c6f0..71b7df8176 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -1079,7 +1079,7 @@ See the [configuration page](configuration.html) for information on Spark config 0.1 This sets the Memory Overhead Factor that will allocate memory to non-JVM memory, which includes off-heap memory allocations, non-JVM tasks, and various systems processes. For JVM-based jobs this value will default to 0.10 and 0.40 for non-JVM jobs. - This is done as non-JVM tasks need more non-JVM heap space and such tasks commonly fail with "Memory Overhead Exceeded" errors. This prempts this error with a higher default. + This is done as non-JVM tasks need more non-JVM heap space and such tasks commonly fail with "Memory Overhead Exceeded" errors. This preempts this error with a higher default. 2.4.0 @@ -1402,4 +1402,4 @@ Kubernetes does not tell Spark the addresses of the resources allocated to each ### Stage Level Scheduling Overview Stage level scheduling is supported on Kubernetes when dynamic allocation is enabled. This also requires spark.dynamicAllocation.shuffleTracking.enabled to be enabled since Kubernetes doesn't support an external shuffle service at this time. The order in which containers for different profiles is requested from Kubernetes is not guaranteed. Note that since dynamic allocation on Kubernetes requires the shuffle tracking feature, this means that executors from previous stages that used a different ResourceProfile may not idle timeout due to having shuffle data on them. This could result in using more cluster resources and in the worst case if there are no remaining resources on the Kubernetes cluster then Spark could potentially hang. You may consider looking at config spark.dynamicAllocation.shuffleTracking.timeout to set a timeout, but that could result in data having to be recomputed if the shuffle data is really needed. -Note, there is a difference in the way pod template resources are handled between the base default profile and custom ResourceProfiles. Any resources specified in the pod template file will only be used with the base default profile. If you create custom ResourceProfiles be sure to include all necessary resources there since the resources from the template file will not be propogated to custom ResourceProfiles. +Note, there is a difference in the way pod template resources are handled between the base default profile and custom ResourceProfiles. Any resources specified in the pod template file will only be used with the base default profile. If you create custom ResourceProfiles be sure to include all necessary resources there since the resources from the template file will not be propagated to custom ResourceProfiles. 
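As a companion to the Kubernetes stage-level scheduling note above, here is a minimal Scala sketch of the configuration it describes; it is not part of this patch, and the master URL, container image, timeout value, and pod template path are placeholders, while the `spark.dynamicAllocation.*` settings are the ones named in the documentation text.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: dynamic allocation with shuffle tracking on Kubernetes.
// Placeholder values: API server address, image name, timeout, template path.
val spark = SparkSession.builder()
  .appName("stage-level-scheduling-sketch")
  .master("k8s://https://<api-server>:6443")
  .config("spark.kubernetes.container.image", "docker.io/myrepo/spark:v3.0.0")
  .config("spark.dynamicAllocation.enabled", "true")
  // Needed because Kubernetes has no external shuffle service.
  .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
  // Optional: release executors holding shuffle data after a bound,
  // at the risk of recomputing that data if it is needed again.
  .config("spark.dynamicAllocation.shuffleTracking.timeout", "30min")
  // Resources in the template apply only to the base default ResourceProfile.
  .config("spark.kubernetes.executor.podTemplateFile", "/path/to/executor-template.yaml")
  .getOrCreate()
```

Any custom ResourceProfile would need to declare its resources explicitly, since the template file is not consulted for profiles other than the default one.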
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 80591bd086..8c0bac1815 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -857,7 +857,7 @@ See the [configuration page](configuration.html) for information on Spark config host Provides support for the `local:///` scheme to reference the app jar resource in cluster mode. - If user uses a local resource (`local:///path/to/jar`) and the config option is not used it defaults to `host` eg. + If user uses a local resource (`local:///path/to/jar`) and the config option is not used it defaults to `host` e.g. the mesos fetcher tries to get the resource from the host's file system. If the value is unknown it prints a warning msg in the dispatcher logs and defaults to `host`. If the value is `container` then spark submit in the container will use the jar in the container's path: diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 73c4930dad..797d18a0d4 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -644,7 +644,7 @@ YARN does not tell Spark the addresses of the resources allocated to each contai # Stage Level Scheduling Overview Stage level scheduling is supported on YARN when dynamic allocation is enabled. One thing to note that is YARN specific is that each ResourceProfile requires a different container priority on YARN. The mapping is simply the ResourceProfile id becomes the priority, on YARN lower numbers are higher priority. This means that profiles created earlier will have a higher priority in YARN. Normally this won't matter as Spark finishes one stage before starting another one, the only case this might have an affect is in a job server type scenario, so its something to keep in mind. -Note there is a difference in the way custom resources are handled between the base default profile and custom ResourceProfiles. To allow for the user to request YARN containers with extra resources without Spark scheduling on them, the user can specify resources via the spark.yarn.executor.resource. config. Those configs are only used in the base default profile though and do not get propogated into any other custom ResourceProfiles. This is because there would be no way to remove them if you wanted a stage to not have them. This results in your default profile getting custom resources defined in spark.yarn.executor.resource. plus spark defined resources of GPU or FPGA. Spark converts GPU and FPGA resources into the YARN built in types yarn.io/gpu) and yarn.io/fpga, but does not know the mapping of any other resources. Any other Spark custom resources are not propogated to YARN for the default profile. So if you want Spark to schedule based off a custom resource and have it requested from YARN, you must specify it in both YARN (spark.yarn.{driver/executor}.resource.) and Spark (spark.{driver/executor}.resource.) configs. Leave the Spark config off if you only want YARN containers with the extra resources but Spark not to schedule using them. Now for custom ResourceProfiles, it doesn't currently have a way to only specify YARN resources without Spark scheduling off of them. This means for custom ResourceProfiles we propogate all the resources defined in the ResourceProfile to YARN. We still convert GPU and FPGA to the YARN build in types as well. This requires that the name of any custom resources you specify match what they are defined as in YARN. +Note there is a difference in the way custom resources are handled between the base default profile and custom ResourceProfiles. 
To allow for the user to request YARN containers with extra resources without Spark scheduling on them, the user can specify resources via the spark.yarn.executor.resource. config. Those configs are only used in the base default profile though and do not get propagated into any other custom ResourceProfiles. This is because there would be no way to remove them if you wanted a stage to not have them. This results in your default profile getting custom resources defined in spark.yarn.executor.resource. plus spark defined resources of GPU or FPGA. Spark converts GPU and FPGA resources into the YARN built in types yarn.io/gpu) and yarn.io/fpga, but does not know the mapping of any other resources. Any other Spark custom resources are not propagated to YARN for the default profile. So if you want Spark to schedule based off a custom resource and have it requested from YARN, you must specify it in both YARN (spark.yarn.{driver/executor}.resource.) and Spark (spark.{driver/executor}.resource.) configs. Leave the Spark config off if you only want YARN containers with the extra resources but Spark not to schedule using them. Now for custom ResourceProfiles, it doesn't currently have a way to only specify YARN resources without Spark scheduling off of them. This means for custom ResourceProfiles we propagate all the resources defined in the ResourceProfile to YARN. We still convert GPU and FPGA to the YARN build in types as well. This requires that the name of any custom resources you specify match what they are defined as in YARN. # Important notes diff --git a/docs/sparkr.md b/docs/sparkr.md index 05310f89f2..002da5a56f 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -671,7 +671,7 @@ Arrow R library is available on CRAN and it can be installed as below. ```bash Rscript -e 'install.packages("arrow", repos="https://cloud.r-project.org/")' ``` -Please refer [the official documentation of Apache Arrow](https://arrow.apache.org/docs/r/) for more detials. +Please refer [the official documentation of Apache Arrow](https://arrow.apache.org/docs/r/) for more details. Note that you must ensure that Arrow R package is installed and available on all cluster nodes. The current supported minimum version is 1.0.0; however, this might change between the minor releases since Arrow optimization in SparkR is experimental. diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md index b95be09745..7d60915e2a 100644 --- a/docs/sql-data-sources-jdbc.md +++ b/docs/sql-data-sources-jdbc.md @@ -131,7 +131,7 @@ the following case-insensitive options: fetchsize - The JDBC fetch size, which determines how many rows to fetch per round trip. This can help performance on JDBC drivers which default to low fetch size (eg. Oracle with 10 rows). This option applies only to reading. + The JDBC fetch size, which determines how many rows to fetch per round trip. This can help performance on JDBC drivers which default to low fetch size (e.g. Oracle with 10 rows). This option applies only to reading. diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 7997090e71..2c86e7a932 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -333,7 +333,7 @@ license: | - - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. Eg. 
if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous version it was the opposite. + - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. For example, if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous version it was the opposite. - In versions 2.2.1+ and 2.3, if `spark.sql.caseSensitive` is set to true, then the `CURRENT_DATE` and `CURRENT_TIMESTAMP` functions incorrectly became case-sensitive and would resolve to columns (unless typed in lower case). In Spark 2.4 this has been fixed and the functions are no longer case-sensitive. @@ -532,11 +532,11 @@ license: | - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant with SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes - - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, ie. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive module (`pmod`). + - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, i.e. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive modulus (`pmod`). - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. - - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. + - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, i.e. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. - Un-aliased subquery's semantic has not been well defined with confusing behaviors. Since Spark 2.3, we invalidate such confusing cases, for example: `SELECT v.i from (SELECT i FROM v)`, Spark will throw an analysis exception in this case because users should not be able to use the qualifier inside a subquery. See [SPARK-20690](https://issues.apache.org/jira/browse/SPARK-20690) and [SPARK-21335](https://issues.apache.org/jira/browse/SPARK-21335) for more details. 
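The decimal-arithmetic items in the migration notes above are easier to see with a concrete query. The snippet below is an illustrative sketch rather than part of this patch; the literals are arbitrary, chosen so that the exact product of two `DECIMAL(38, 18)` values cannot be represented without reducing the scale.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("decimal-precision-sketch")
  .getOrCreate()

// The exact product would need more precision than the 38-digit maximum,
// so Spark must either reduce the scale (never below 6) or return NULL.
val q = """SELECT CAST(12345678901234567890.123 AS DECIMAL(38, 18)) *
          |       CAST(1.123456789123456789 AS DECIMAL(38, 18)) AS product""".stripMargin

spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "true")
spark.sql(q).show(false)  // default since 2.3: a rounded, non-NULL result

spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "false")
spark.sql(q).show(false)  // previous rules: NULL, no exact representation fits
```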
diff --git a/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md b/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md index 47dd2be77a..ada86d8dd3 100644 --- a/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md +++ b/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md @@ -43,7 +43,7 @@ SET TIME ZONE INTERVAL interval_literal * **interval_literal** - The [interval literal](sql-ref-literals.html#interval-literal) represents the difference between the session time zone to the 'UTC'. It must be in the range of [-18, 18] hours and max to second precision, e.g. `INTERVAL 2 HOURS 30 MINITUES` or `INTERVAL '15:40:32' HOUR TO SECOND`. + The [interval literal](sql-ref-literals.html#interval-literal) represents the difference between the session time zone to the 'UTC'. It must be in the range of [-18, 18] hours and max to second precision, e.g. `INTERVAL 2 HOURS 30 MINUTES` or `INTERVAL '15:40:32' HOUR TO SECOND`. ### Examples diff --git a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md index 3a8c8d5b11..11ec2f1d9e 100644 --- a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md +++ b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md @@ -42,10 +42,10 @@ CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier row_format: : SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] - | DELIMITED [ FIELDS TERMINATED BY fields_termiated_char [ ESCAPED BY escaped_char ] ] - [ COLLECTION ITEMS TERMINATED BY collection_items_termiated_char ] - [ MAP KEYS TERMINATED BY map_key_termiated_char ] - [ LINES TERMINATED BY row_termiated_char ] + | DELIMITED [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] + [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] + [ MAP KEYS TERMINATED BY map_key_terminated_char ] + [ LINES TERMINATED BY row_terminated_char ] [ NULL DEFINED AS null_char ] ``` diff --git a/docs/sql-ref-syntax-dml-insert-into.md b/docs/sql-ref-syntax-dml-insert-into.md index ed5da2b2d2..39d15808d0 100644 --- a/docs/sql-ref-syntax-dml-insert-into.md +++ b/docs/sql-ref-syntax-dml-insert-into.md @@ -69,11 +69,11 @@ INSERT INTO students VALUES ('Amy Smith', '123 Park Ave, San Jose', 111111); SELECT * FROM students; -+---------+---------------------+----------+ -| name| address|student_id| -+---------+---------------------+----------+ -|Amy Smith|123 Park Ave,San Jose| 111111| -+---------+---------------------+----------+ ++---------+----------------------+----------+ +| name| address|student_id| ++---------+----------------------+----------+ +|Amy Smith|123 Park Ave, San Jose| 111111| ++---------+----------------------+----------+ ``` #### Multi-Row Insert Using a VALUES Clause @@ -100,29 +100,29 @@ SELECT * FROM students; ```sql -- Assuming the persons table has already been created and populated. 
SELECT * FROM persons; -+-------------+-------------------------+---------+ -| name| address| ssn| -+-------------+-------------------------+---------+ -|Dora Williams|134 Forest Ave, Melo Park|123456789| -+-------------+-------------------------+---------+ -| Eddie Davis| 245 Market St, Milpitas|345678901| -+-------------+-------------------------+---------+ ++-------------+--------------------------+---------+ +| name| address| ssn| ++-------------+--------------------------+---------+ +|Dora Williams|134 Forest Ave, Menlo Park|123456789| ++-------------+--------------------------+---------+ +| Eddie Davis| 245 Market St, Milpitas|345678901| ++-------------+--------------------------+---------+ INSERT INTO students PARTITION (student_id = 444444) SELECT name, address FROM persons WHERE name = "Dora Williams"; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ ``` #### Insert Using a TABLE Statement @@ -141,21 +141,21 @@ SELECT * FROM visiting_students; INSERT INTO students TABLE visiting_students; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave,San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ -|Fleur Laurent| 345 Copper St, London| 777777| -+-------------+-------------------------+----------+ -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ +|Fleur Laurent| 345 Copper St, London| 777777| ++-------------+--------------------------+----------+ +|Gordon Martin| 779 Lake Ave, 
Oxford| 888888| ++-------------+--------------------------+----------+ ``` #### Insert Using a FROM Statement @@ -177,25 +177,25 @@ INSERT INTO students FROM applicants SELECT name, address, id applicants WHERE qualified = true; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ -|Fleur Laurent| 345 Copper St, London| 777777| -+-------------+-------------------------+----------+ -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -+-------------+-------------------------+----------+ -| Helen Davis|469 Mission St, San Diego| 999999| -+-------------+-------------------------+----------+ -| Jason Wang| 908 Bird St, Saratoga| 121212| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ +|Fleur Laurent| 345 Copper St, London| 777777| ++-------------+--------------------------+----------+ +|Gordon Martin| 779 Lake Ave, Oxford| 888888| ++-------------+--------------------------+----------+ +| Helen Davis| 469 Mission St, San Diego| 999999| ++-------------+--------------------------+----------+ +| Jason Wang| 908 Bird St, Saratoga| 121212| ++-------------+--------------------------+----------+ ``` ### Related Statements diff --git a/docs/sql-ref-syntax-dml-insert-overwrite-table.md b/docs/sql-ref-syntax-dml-insert-overwrite-table.md index ecfd060dfd..638dcb34bb 100644 --- a/docs/sql-ref-syntax-dml-insert-overwrite-table.md +++ b/docs/sql-ref-syntax-dml-insert-overwrite-table.md @@ -64,18 +64,18 @@ INSERT OVERWRITE [ TABLE ] table_identifier [ partition_spec [ IF NOT EXISTS ] ] ```sql -- Assuming the students table has already been created and populated. 
SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -| Bob Brown| 456 Taylor St, Cupertino| 222222| -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -|Dora Williams|134 Forest Ave, Melo Park| 444444| -|Fleur Laurent| 345 Copper St, London| 777777| -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -| Helen Davis|469 Mission St, San Diego| 999999| -| Jason Wang| 908 Bird St, Saratoga| 121212| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| +| Bob Brown| 456 Taylor St, Cupertino| 222222| +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| +|Dora Williams|134 Forest Ave, Menlo Park| 444444| +|Fleur Laurent| 345 Copper St, London| 777777| +|Gordon Martin| 779 Lake Ave, Oxford| 888888| +| Helen Davis| 469 Mission St, San Diego| 999999| +| Jason Wang| 908 Bird St, Saratoga| 121212| ++-------------+--------------------------+----------+ INSERT OVERWRITE students VALUES ('Ashua Hill', '456 Erica Ct, Cupertino', 111111), @@ -95,25 +95,25 @@ SELECT * FROM students; ```sql -- Assuming the persons table has already been created and populated. SELECT * FROM persons; -+-------------+-------------------------+---------+ -| name| address| ssn| -+-------------+-------------------------+---------+ -|Dora Williams|134 Forest Ave, Melo Park|123456789| -+-------------+-------------------------+---------+ -| Eddie Davis| 245 Market St,Milpitas|345678901| -+-------------+-------------------------+---------+ ++-------------+--------------------------+---------+ +| name| address| ssn| ++-------------+--------------------------+---------+ +|Dora Williams|134 Forest Ave, Menlo Park|123456789| ++-------------+--------------------------+---------+ +| Eddie Davis| 245 Market St, Milpitas|345678901| ++-------------+--------------------------+---------+ INSERT OVERWRITE students PARTITION (student_id = 222222) SELECT name, address FROM persons WHERE name = "Dora Williams"; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Ashua Hill| 456 Erica Ct, Cupertino| 111111| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 222222| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Ashua Hill| 456 Erica Ct, Cupertino| 111111| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 222222| ++-------------+--------------------------+----------+ ``` #### Insert Using a TABLE Statement diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md index 934e5f70d4..ef9de1f594 100644 --- a/docs/sql-ref-syntax-qry-select-groupby.md +++ b/docs/sql-ref-syntax-qry-select-groupby.md @@ -269,7 +269,7 @@ INSERT INTO person VALUES (300, 'Mike', 80), (400, 'Dan', 50); ---Select the first row in cloumn age +--Select the first row in column age SELECT FIRST(age) FROM person; +--------------------+ | first(age, false) | @@ -277,7 +277,7 @@ SELECT FIRST(age) FROM person; | NULL | +--------------------+ ---Get the first row in cloumn 
`age` ignore nulls,last row in column `id` and sum of cloumn `id`. +--Get the first row in column `age` ignore nulls,last row in column `id` and sum of column `id`. SELECT FIRST(age IGNORE NULLS), LAST(id), SUM(id) FROM person; +-------------------+------------------+----------+ | first(age, true) | last(id, false) | sum(id) | diff --git a/docs/sql-ref-syntax-qry-select-lateral-view.md b/docs/sql-ref-syntax-qry-select-lateral-view.md index f742c8fa57..c854625a1a 100644 --- a/docs/sql-ref-syntax-qry-select-lateral-view.md +++ b/docs/sql-ref-syntax-qry-select-lateral-view.md @@ -58,7 +58,7 @@ INSERT INTO person VALUES (400, 'Dan', 50, 4, 'Street 4'); SELECT * FROM person - LATERAL VIEW EXPLODE(ARRAY(30, 60)) tabelName AS c_age + LATERAL VIEW EXPLODE(ARRAY(30, 60)) tableName AS c_age LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age; +------+-------+-------+--------+-----------+--------+--------+ | id | name | age | class | address | c_age | d_age | @@ -93,14 +93,14 @@ GROUP BY c_age; +--------+-----------+ SELECT * FROM person - LATERAL VIEW EXPLODE(ARRAY()) tabelName AS c_age; + LATERAL VIEW EXPLODE(ARRAY()) tableName AS c_age; +-----+-------+------+--------+----------+--------+ | id | name | age | class | address | c_age | +-----+-------+------+--------+----------+--------+ +-----+-------+------+--------+----------+--------+ SELECT * FROM person - LATERAL VIEW OUTER EXPLODE(ARRAY()) tabelName AS c_age; + LATERAL VIEW OUTER EXPLODE(ARRAY()) tableName AS c_age; +------+-------+-------+--------+-----------+--------+ | id | name | age | class | address | c_age | +------+-------+-------+--------+-----------+--------+ diff --git a/docs/sql-ref-syntax-qry-select-orderby.md b/docs/sql-ref-syntax-qry-select-orderby.md index 13f0ae40cb..552ee9be66 100644 --- a/docs/sql-ref-syntax-qry-select-orderby.md +++ b/docs/sql-ref-syntax-qry-select-orderby.md @@ -28,7 +28,7 @@ clause, this clause guarantees a total order in the output. ### Syntax ```sql -ORDER BY { expression [ sort_direction | nulls_sort_oder ] [ , ... ] } +ORDER BY { expression [ sort_direction | nulls_sort_order ] [ , ... ] } ``` ### Parameters diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala index a785d063f1..3dea244c77 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala @@ -127,7 +127,7 @@ private[evaluation] abstract class Silhouette { * `$a_{i}$` can be interpreted as how well `i` is assigned to its cluster * (the smaller the value, the better the assignment), while `$b_{i}$` is * a measure of how well `i` has not been assigned to its "neighboring cluster", - * ie. the nearest cluster to `i`. + * i.e. the nearest cluster to `i`. * * Unfortunately, the naive implementation of the algorithm requires to compute * the distance of each couple of points in the dataset. Since the computation of @@ -486,7 +486,7 @@ private[evaluation] object CosineSilhouette extends Silhouette { * for the point. * @param weightCol The name of the column which contains the instance weight. * @return A [[scala.collection.immutable.Map]] which associates each cluster id to a - * its statistics (ie. the precomputed values `N` and `$\Omega_{\Gamma}$`). + * its statistics (i.e. the precomputed values `N` and `$\Omega_{\Gamma}$`). 
*/ def computeClusterStats( df: DataFrame, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 5ed7619fce..2ec7a8632e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -112,7 +112,7 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) (Seq($(inputCol)), Seq($(outputCol)), Seq($(threshold))) } - val ouputCols = inputColNames.zip(tds).map { case (inputColName, td) => + val mappedOutputCols = inputColNames.zip(tds).map { case (inputColName, td) => val binarizerUDF = dataset.schema(inputColName).dataType match { case DoubleType => udf { in: Double => if (in > td) 1.0 else 0.0 } @@ -147,8 +147,8 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) binarizerUDF(col(inputColName)) } - val ouputMetadata = outputColNames.map(outputSchema(_).metadata) - dataset.withColumns(outputColNames, ouputCols, ouputMetadata) + val outputMetadata = outputColNames.map(outputSchema(_).metadata) + dataset.withColumns(outputColNames, mappedOutputCols, outputMetadata) } @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala index 46052a89fd..41de26dff0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala @@ -77,7 +77,7 @@ private[feature] trait SelectorParams extends Params * @group param */ @Since("3.1.0") - final val fpr = new DoubleParam(this, "fpr", "The higest p-value for features to be kept.", + final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.", ParamValidators.inRange(0, 1)) /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index b6ed4f2b00..8bcd7909b6 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -166,11 +166,11 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String } val (inputColNames, outputColNames) = getInOutCols() - val ouputCols = inputColNames.map { inputColName => + val outputCols = inputColNames.map { inputColName => t(col(inputColName)) } - val ouputMetadata = outputColNames.map(outputSchema(_).metadata) - dataset.withColumns(outputColNames, ouputCols, ouputMetadata) + val outputMetadata = outputColNames.map(outputSchema(_).metadata) + dataset.withColumns(outputColNames, outputCols, outputMetadata) } @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index 5efcf0dce6..37b715930a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -133,7 +133,7 @@ object ImageSchema { val img = try { ImageIO.read(new ByteArrayInputStream(bytes)) } catch { - // Catch runtime exception because `ImageIO` may throw unexcepted `RuntimeException`. + // Catch runtime exception because `ImageIO` may throw unexpected `RuntimeException`. 
// But do not catch the declared `IOException` (regarded as FileSystem failure) case _: RuntimeException => null } diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala index 1b5f77a9ae..594d9f315f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala @@ -88,9 +88,9 @@ private[r] object AFTSurvivalRegressionWrapper extends MLReadable[AFTSurvivalReg aggregationDepth: Int, stringIndexerOrderType: String): AFTSurvivalRegressionWrapper = { - val (rewritedFormula, censorCol) = formulaRewrite(formula) + val (rewrittenFormula, censorCol) = formulaRewrite(formula) - val rFormula = new RFormula().setFormula(rewritedFormula) + val rFormula = new RFormula().setFormula(rewrittenFormula) .setStringIndexerOrderType(stringIndexerOrderType) RWrapperUtils.checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala index 84c0985245..f70baa4ddd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala @@ -555,7 +555,7 @@ object FMRegressionModel extends MLReadable[FMRegressionModel] { * \hat{y} = p\left( y_{fm} \right) * }}} * p is the prediction function, for binary classification task is sigmoid. - * The loss funcation gradient formula: + * The loss function gradient formula: * {{{ * \frac{\partial}{\partial\theta} l\left( \hat{y},y \right) = * \frac{\partial}{\partial\theta} l\left( p\left( y_{fm} \right),y \right) = diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 087c2c2639..90cc4fb13b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -146,7 +146,7 @@ class SVMWithSGD private ( /** * Construct a SVM object with default parameters: {stepSize: 1.0, numIterations: 100, - * regParm: 0.01, miniBatchFraction: 1.0}. + * regParam: 0.01, miniBatchFraction: 1.0}. */ @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala index bffed61c29..9ac473aabe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala @@ -41,7 +41,7 @@ private[spark] abstract class DistanceMeasure extends Serializable { * 1, if i != j: a bound r = matrix(i,j) to help avoiding unnecessary distance * computation. Given point x, let i be current closest center, and d be current best * distance, if d < f(r), then we no longer need to compute the distance to center j; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If distance + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If distance * between point x and center i is less than f(r), then center i is the closest center * to point x. 
*/ @@ -268,7 +268,7 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure { * squared distance, if d < r, then we no longer need to compute the distance to center * j. matrix(i,j) equals to squared of half of Euclidean distance between centers i * and j; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If squared + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If squared * distance between point x and center i is less than r, then center i is the closest * center to point x. */ @@ -405,7 +405,7 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure { * is used instead of Cosine distance to compute matrix(i,j): for centers i and j, * compute the radian/angle between them, halving it, and converting it back to Cosine * distance at the end; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If Cosine + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If Cosine * distance between point x and center i is less than r, then center i is the closest * center to point x. */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index b2742ee6ec..c9f6d789d6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -466,7 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer with Logging { val seed = randomGenerator.nextLong() // If and only if optimizeDocConcentration is set true, // we calculate logphat in the same pass as other statistics. - // No calculation of loghat happens otherwise. + // No calculation of logphat happens otherwise. val logphatPartOptionBase = () => if (optimizeDocConcentration) { Some(BDV.zeros[Double](k)) } else { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index 3c9b806d61..111030dada 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -36,7 +36,7 @@ import org.apache.spark.util.random.XORShiftRandom * doing a single iteration of the standard k-means algorithm. * * The update algorithm uses the "mini-batch" KMeans rule, - * generalized to incorporate forgetfullness (i.e. decay). + * generalized to incorporate forgetfulness (i.e. decay). * The update rule (for each cluster) is: * *
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index c165d4810c..f7c6d09f5e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -48,11 +48,11 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { val mat = if (numFeatures > 65535) { val summary = Statistics.colStats(sources.map((_, 1.0)), Seq("mean")) val mean = Vectors.fromML(summary.mean) - val meanCentredRdd = sources.map { row => + val meanCenteredRdd = sources.map { row => BLAS.axpy(-1, mean, row) row } - new RowMatrix(meanCentredRdd) + new RowMatrix(meanCenteredRdd) } else { require(PCAUtil.memoryCost(k, numFeatures) < Int.MaxValue, "The param k and numFeatures is too large for SVD computation. " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 13899fa829..eeb583f84c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -560,7 +560,7 @@ class Word2VecModel private[spark] ( /** * Find synonyms of the vector representation of a word, possibly - * including any words in the model vocabulary whose vector respresentation + * including any words in the model vocabulary whose vector representation * is the supplied vector. * @param vector vector representation of a word * @param num number of synonyms to find diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 601c7da30f..606e2f2f21 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -88,8 +88,8 @@ class AssociationRules private[fpm] ( // Join to get (X, ((Y, freq(X union Y)), freq(X))), generate rules, and filter by confidence candidates.join(freqItemsets.map(x => (x.items.toSeq, x.freq))) - .map { case (antecendent, ((consequent, freqUnion), freqAntecedent)) => - new Rule(antecendent.toArray, + .map { case (antecedent, ((consequent, freqUnion), freqAntecedent)) => + new Rule(antecedent.toArray, consequent.toArray, freqUnion, freqAntecedent, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index c618b71ddc..d546f0c1a8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -693,11 +693,11 @@ class RowMatrix @Since("1.0.0") ( val pBV = sc.broadcast(colMagsCorrected.map(c => sg / c)) val qBV = sc.broadcast(colMagsCorrected.map(c => math.min(sg, c))) - val sims = rows.mapPartitionsWithIndex { (indx, iter) => + val sims = rows.mapPartitionsWithIndex { (index, iter) => val p = pBV.value val q = qBV.value - val rand = new XORShiftRandom(indx) + val rand = new XORShiftRandom(index) val scaled = new Array[Double](p.size) iter.flatMap { row => row match { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala index d17f7047c5..778de30e75 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala @@ -91,7 +91,7 @@ private[stat] object KolmogorovSmirnovTest extends Logging { * @param partData `Iterator[Double]` 1 partition of a sorted RDD * @param n `Double` the total size of the RDD * @param cdf `Double => Double` a function the calculates the theoretical CDF of a value - * @return `Iterator[(Double, Double)] `Unadjusted (ie. off by a constant) potential extrema + * @return `Iterator[(Double, Double)] `Unadjusted (i.e. off by a constant) potential extrema * in a partition. The first element corresponds to the (empirical CDF - 1/N) - CDF, * the second element corresponds to empirical CDF - CDF. We can then search the resulting * iterator for the minimum of the first and the maximum of the second element, and provide diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java index 6480b57e1f..af32e03854 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -41,7 +41,7 @@ public class JavaStopWordsRemoverSuite extends SharedSparkSession { .setOutputCol("filtered"); List data = Arrays.asList( - RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) ); StructType schema = new StructType(new StructField[]{ diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index b35f964c95..0eae23df83 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -181,7 +181,7 @@ class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest { } } - test("multivariate data and check againt R mvnormalmixEM") { + test("multivariate data and check against R mvnormalmixEM") { /* Using the following R code to generate data and train the model using mixtools package. library(mvtnorm) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala index 5ee161ce8d..deaad2bd54 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala @@ -36,7 +36,7 @@ class RegressionEvaluatorSuite test("Regression Evaluator: default params") { /** * Here is the instruction describing how to export the test data into CSV format - * so we can validate the metrics compared with R's mmetric package. + * so we can validate the metrics compared with R's mmetric function. 
* * import org.apache.spark.mllib.util.LinearDataGenerator * val data = sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala index 1e1ab206cc..0d664e421d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala @@ -133,35 +133,35 @@ class ANOVASelectorSuite extends MLTest with DefaultReadWriteTest { ParamsSuite.checkParams(new ANOVASelector()) } - test("Test ANOVAFValue calssification selector: numTopFeatures") { + test("Test ANOVAFValue classification selector: numTopFeatures") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(1) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: percentile") { + test("Test ANOVAFValue classification selector: percentile") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.17) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fpr") { + test("Test ANOVAFValue classification selector: fpr") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fpr").setFpr(1.0E-12) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fdr") { + test("Test ANOVAFValue classification selector: fdr") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fdr").setFdr(6.0E-12) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fwe") { + test("Test ANOVAFValue classification selector: fwe") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fwe").setFwe(6.0E-12) val model = testSelector(selector, dataset) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala index 19645b517d..8f8365a590 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala @@ -81,7 +81,7 @@ class DCTSuite extends MLTest with DefaultReadWriteTest { .map { case Row(vec: Vector) => vec.size } .head() - // Can not infer size of ouput vector, since no metadata is provided + // Can not infer size of output vector, since no metadata is provided intercept[TestFailedException] { val transformed = transformer.transform(dataset) checkVectorSizeOnDF(transformed, "resultVec", vectorSize) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index 9356468199..55dade2892 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.DataTypes private[ml] object LSHTest { /** - * For any locality sensitive function h in a metric space, we meed to verify whether + * For any locality sensitive function h in a metric space, we need to verify whether * the following property is satisfied. 
* * There exist dist1, dist2, p1, p2, so that for any two elements e1 and e2, diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala index cc451c0b60..142abf2ccd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala @@ -53,7 +53,7 @@ class VarianceThresholdSelectorSuite extends MLTest with DefaultReadWriteTest { ParamsSuite.checkParams(new VarianceThresholdSelector) } - test("Test VarianceThresholdSelector: varainceThreshold not set") { + test("Test VarianceThresholdSelector: varianceThreshold not set") { val selector = new VarianceThresholdSelector().setOutputCol("filtered") val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a0e17a4b40..bfa9f4b595 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -494,7 +494,7 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest [1] -0.0457441 -0.6833928 [1] 1.8121235 -0.1747493 -0.5815417 - R code for deivance calculation: + R code for deviance calculation: data = cbind(y=c(0,1,0,0,0,1), x1=c(18, 12, 15, 13, 15, 16), x2=c(1,0,0,2,1,1)) summary(glm(y~x1+x2, family=poisson, data=data.frame(data)))$deviance [1] 3.70055 @@ -1661,7 +1661,7 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest } test("evaluate with labels that are not doubles") { - // Evaulate with a dataset that contains Labels not as doubles to verify correct casting + // Evaluate with a dataset that contains Labels not as doubles to verify correct casting val dataset = Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(19.0, 1.0, Vectors.dense(1.0, 7.0)), diff --git a/pom.xml b/pom.xml index cd7e1767d6..f0ad9b0167 100644 --- a/pom.xml +++ b/pom.xml @@ -229,7 +229,7 @@ declared in the projects that build assemblies. For other projects the scope should remain as "compile", otherwise they are not available - during compilation if the dependency is transivite (e.g. "graphx/" depending on "core/" and + during compilation if the dependency is transitive (e.g. "graphx/" depending on "core/" and needing Hadoop classes in the classpath to compile). 
--> compile @@ -1758,7 +1758,7 @@ ${hive.deps.scope} - + ${hive.group} hive-metastore diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index f696e93e9c..386de19e91 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -113,10 +113,9 @@ class ExecutorClassLoaderSuite val classLoader = new ExecutorClassLoader( new SparkConf(), null, url1, parentLoader, true) - // load 'scala.Option', using ClassforName to do the exact same behavior as - // what JavaDeserializationStream does - // scalastyle:off classforname + // load 'scala.Option', using Class.forName to do the exact same behavior as + // what JavaDeserializationStream does val optionClass = Class.forName("scala.Option", false, classLoader) // scalastyle:on classforname