[SPARK-25705][BUILD][STREAMING][TEST-MAVEN] Remove Kafka 0.8 integration

## What changes were proposed in this pull request?

Remove Kafka 0.8 integration

## How was this patch tested?

Existing tests, build scripts

Closes #22703 from srowen/SPARK-25705.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
Sean Owen 2018-10-16 09:10:24 -05:00
parent 2c664edc06
commit 703e6da1ec
51 changed files with 39 additions and 5987 deletions


@@ -992,9 +992,9 @@ private[spark] object SparkSubmitUtils {
   // Exposed for testing.
   // These components are used to make the default exclusion rules for Spark dependencies.
-  // We need to specify each component explicitly, otherwise we miss spark-streaming-kafka-0-8 and
-  // other spark-streaming utility components. Underscore is there to differentiate between
-  // spark-streaming_2.1x and spark-streaming-kafka-0-8-assembly_2.1x
+  // We need to specify each component explicitly, otherwise we miss
+  // spark-streaming utility components. Underscore is there to differentiate between
+  // spark-streaming_2.1x and spark-streaming-kafka-0-10-assembly_2.1x
   val IVY_DEFAULT_EXCLUDES = Seq("catalyst_", "core_", "graphx_", "kvstore_", "launcher_", "mllib_",
     "mllib-local_", "network-common_", "network-shuffle_", "repl_", "sketch_", "sql_", "streaming_",
     "tags_", "unsafe_")


@@ -115,7 +115,9 @@ SCALA_2_10_PROFILES="-Pscala-2.10"
 SCALA_2_11_PROFILES=
 if [[ $SPARK_VERSION > "2.3" ]]; then
   BASE_PROFILES="$BASE_PROFILES -Pkubernetes"
-  SCALA_2_11_PROFILES="-Pkafka-0-8"
+  if [[ $SPARK_VERSION < "3.0." ]]; then
+    SCALA_2_11_PROFILES="-Pkafka-0-8"
+  fi
 else
   PUBLISH_SCALA_2_10=1
 fi


@@ -24,7 +24,7 @@ set -e
 FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
 cd "$FWDIR"
-SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"
+SPARK_PROFILES="-Pmesos -Pkubernetes -Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"
 TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)"
 OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)"


@@ -332,7 +332,6 @@ def build_spark_sbt(hadoop_version):
     # Enable all of the profiles for the build:
     build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
     sbt_goals = ["test:package",  # Build test jars as some tests depend on them
-                 "streaming-kafka-0-8-assembly/assembly",
                  "streaming-kinesis-asl-assembly/assembly"]
     profiles_and_goals = build_profiles + sbt_goals


@@ -23,7 +23,6 @@ ERRORS=$(echo -e "q\n" \
     | build/sbt \
         -Pkinesis-asl \
         -Pmesos \
-        -Pkafka-0-8 \
         -Pkubernetes \
         -Pyarn \
         -Phive \


@@ -23,7 +23,6 @@ ERRORS=$(echo -e "q\n" \
     | build/sbt \
         -Pkinesis-asl \
         -Pmesos \
-        -Pkafka-0-8 \
        -Pkubernetes \
        -Pyarn \
        -Phive \


@@ -252,24 +252,6 @@ streaming_kinesis_asl = Module(
 )
-streaming_kafka = Module(
-    name="streaming-kafka-0-8",
-    dependencies=[streaming],
-    source_file_regexes=[
-        "external/kafka-0-8",
-        "external/kafka-0-8-assembly",
-    ],
-    build_profile_flags=[
-        "-Pkafka-0-8",
-    ],
-    environ={
-        "ENABLE_KAFKA_0_8_TESTS": "1"
-    },
-    sbt_test_goals=[
-        "streaming-kafka-0-8/test",
-    ]
-)
 streaming_kafka_0_10 = Module(
     name="streaming-kafka-0-10",
     dependencies=[streaming],
@@ -374,15 +356,11 @@ pyspark_streaming = Module(
     dependencies=[
         pyspark_core,
         streaming,
-        streaming_kafka,
         streaming_kinesis_asl
     ],
     source_file_regexes=[
         "python/pyspark/streaming"
     ],
-    environ={
-        "ENABLE_KAFKA_0_8_TESTS": "1"
-    },
     python_test_goals=[
         "pyspark.streaming.util",
         "pyspark.streaming.tests",


@@ -29,7 +29,7 @@ export LC_ALL=C
 # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution.
 # NOTE: These should match those in the release publishing script
-HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Phive"
+HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkubernetes -Pyarn -Phive"
 MVN="build/mvn"
 HADOOP_PROFILES=(
     hadoop-2.7


@@ -89,15 +89,6 @@ like ZooKeeper and Hadoop itself.
 ## Building with Kubernetes support
     ./build/mvn -Pkubernetes -DskipTests clean package
-## Building with Kafka 0.8 support
-Kafka 0.8 support must be explicitly enabled with the `kafka-0-8` profile.
-Note: Kafka 0.8 support is deprecated as of Spark 2.3.0.
-    ./build/mvn -Pkafka-0-8 -DskipTests clean package
-Kafka 0.10 support is still automatically built.
 ## Building submodules individually


@@ -2017,7 +2017,7 @@ showDF(properties, numRows = 200, truncate = FALSE)
   <td>
     Maximum rate (number of records per second) at which data will be read from each Kafka
     partition when using the new Kafka direct stream API. See the
-    <a href="streaming-kafka-integration.html">Kafka Integration guide</a>
+    <a href="streaming-kafka-0-10-integration.html">Kafka Integration guide</a>
     for more details.
   </td>
 </tr>
@@ -2029,16 +2029,6 @@ showDF(properties, numRows = 200, truncate = FALSE)
     partition when using the new Kafka direct stream API.
   </td>
 </tr>
-<tr>
-  <td><code>spark.streaming.kafka.maxRetries</code></td>
-  <td>1</td>
-  <td>
-    Maximum number of consecutive retries the driver will make in order to find
-    the latest offsets on the leader of each partition (a default value of 1
-    means that the driver will make a maximum of 2 attempts). Only applies to
-    the new Kafka direct stream API.
-  </td>
-</tr>
 <tr>
   <td><code>spark.streaming.ui.retainedBatches</code></td>
   <td>1000</td>


@@ -3,7 +3,10 @@ layout: global
 title: Spark Streaming + Kafka Integration Guide (Kafka broker version 0.10.0 or higher)
 ---
-The Spark Streaming integration for Kafka 0.10 is similar in design to the 0.8 [Direct Stream approach](streaming-kafka-0-8-integration.html#approach-2-direct-approach-no-receivers). It provides simple parallelism, 1:1 correspondence between Kafka partitions and Spark partitions, and access to offsets and metadata. However, because the newer integration uses the [new Kafka consumer API](http://kafka.apache.org/documentation.html#newconsumerapi) instead of the simple API, there are notable differences in usage. This version of the integration is marked as experimental, so the API is potentially subject to change.
+The Spark Streaming integration for Kafka 0.10 provides simple parallelism, 1:1 correspondence between Kafka
+partitions and Spark partitions, and access to offsets and metadata. However, because the newer integration uses
+the [new Kafka consumer API](https://kafka.apache.org/documentation.html#newconsumerapi) instead of the simple API,
+there are notable differences in usage.
 ### Linking
 For Scala/Java applications using SBT/Maven project definitions, link your streaming application with the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information).
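For orientation, here is a minimal sketch of a direct stream built with the remaining 0.10 integration, which replaces the removed 0.8 createStream/createDirectStream variants; the broker address, group id, and topic name are placeholder assumptions.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object DirectKafkaWordCount010 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DirectKafkaWordCount010")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Placeholder broker, group, and topic values; adjust for your cluster.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example-group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))
    val topics = Array("test")

    // One RDD partition per Kafka partition, as with the old direct approach.
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

    stream.map(record => (record.key, record.value)).print()

    ssc.start()
    ssc.awaitTermination()
  }
}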


@@ -1,196 +0,0 @@
---
layout: global
title: Spark Streaming + Kafka Integration Guide (Kafka broker version 0.8.2.1 or higher)
---
**Note: Kafka 0.8 support is deprecated as of Spark 2.3.0.**
Here we explain how to configure Spark Streaming to receive data from Kafka. There are two approaches to this - the old approach using Receivers and Kafka's high-level API, and a new approach (introduced in Spark 1.3) without using Receivers. They have different programming models, performance characteristics, and semantics guarantees, so read on for more details. Both approaches are considered stable APIs as of the current version of Spark.
## Approach 1: Receiver-based Approach
This approach uses a Receiver to receive the data. The Receiver is implemented using the Kafka high-level consumer API. As with all receivers, the data received from Kafka through a Receiver is stored in Spark executors, and then jobs launched by Spark Streaming processes the data.
However, under the default configuration, this approach can lose data under failures (see [receiver reliability](streaming-programming-guide.html#receiver-reliability)). To ensure zero data loss, you have to additionally enable Write-Ahead Logs in Spark Streaming (introduced in Spark 1.2). This synchronously saves all the received Kafka data into write-ahead logs on a distributed file system (e.g. HDFS), so that all the data can be recovered on failure. See the [Deploying section](streaming-programming-guide.html#deploying-applications) in the streaming programming guide for more details on Write-Ahead Logs.
Next, we discuss how to use this approach in your streaming application.
1. **Linking:** For Scala/Java applications using SBT/Maven project definitions, link your streaming application with the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information).
groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-8_{{site.SCALA_BINARY_VERSION}}
version = {{site.SPARK_VERSION_SHORT}}
For Python applications, you will have to add this above library and its dependencies when deploying your application. See the *Deploying* subsection below.
2. **Programming:** In the streaming application code, import `KafkaUtils` and create an input DStream as follows.
<div class="codetabs">
<div data-lang="scala" markdown="1">
import org.apache.spark.streaming.kafka._
val kafkaStream = KafkaUtils.createStream(streamingContext,
[ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume])
You can also specify the key and value classes and their corresponding decoder classes using variations of `createStream`. See the [API docs](api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$).
</div>
<div data-lang="java" markdown="1">
import org.apache.spark.streaming.kafka.*;
JavaPairReceiverInputDStream<String, String> kafkaStream =
KafkaUtils.createStream(streamingContext,
[ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume]);
You can also specify the key and value classes and their corresponding decoder classes using variations of `createStream`. See the [API docs](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html).
</div>
<div data-lang="python" markdown="1">
from pyspark.streaming.kafka import KafkaUtils
kafkaStream = KafkaUtils.createStream(streamingContext, \
[ZK quorum], [consumer group id], [per-topic number of Kafka partitions to consume])
By default, the Python API will decode Kafka data as UTF8 encoded strings. You can specify your custom decoding function to decode the byte arrays in Kafka records to any arbitrary data type. See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.kafka.KafkaUtils).
</div>
</div>
**Points to remember:**
- Topic partitions in Kafka do not correlate to partitions of RDDs generated in Spark Streaming. So increasing the number of topic-specific partitions in `KafkaUtils.createStream()` only increases the number of threads used to consume topics within a single receiver. It does not increase the parallelism of Spark in processing the data. Refer to the main document for more information on that.
- Multiple Kafka input DStreams can be created with different groups and topics for parallel receiving of data using multiple receivers.
- If you have enabled Write-Ahead Logs with a replicated file system like HDFS, the received data is already being replicated in the log. Hence, set the storage level for the input stream to `StorageLevel.MEMORY_AND_DISK_SER` (that is, use
`KafkaUtils.createStream(..., StorageLevel.MEMORY_AND_DISK_SER)`).
3. **Deploying:** As with any Spark applications, `spark-submit` is used to launch your application. However, the details are slightly different for Scala/Java applications and Python applications.
For Scala and Java applications, if you are using SBT or Maven for project management, then package `spark-streaming-kafka-0-8_{{site.SCALA_BINARY_VERSION}}` and its dependencies into the application JAR. Make sure `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` are marked as `provided` dependencies as those are already present in a Spark installation. Then use `spark-submit` to launch your application (see [Deploying section](streaming-programming-guide.html#deploying-applications) in the main programming guide).
For Python applications which lack SBT/Maven project management, `spark-streaming-kafka-0-8_{{site.SCALA_BINARY_VERSION}}` and its dependencies can be directly added to `spark-submit` using `--packages` (see [Application Submission Guide](submitting-applications.html)). That is,
./bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_{{site.SCALA_BINARY_VERSION}}:{{site.SPARK_VERSION_SHORT}} ...
Alternatively, you can also download the JAR of the Maven artifact `spark-streaming-kafka-0-8-assembly` from the
[Maven repository](https://search.maven.org/#search|ga|1|a%3A%22spark-streaming-kafka-0-8-assembly_{{site.SCALA_BINARY_VERSION}}%22%20AND%20v%3A%22{{site.SPARK_VERSION_SHORT}}%22) and add it to `spark-submit` with `--jars`.
## Approach 2: Direct Approach (No Receivers)
This new receiver-less "direct" approach was introduced in Spark 1.3 to ensure stronger end-to-end guarantees. Instead of using receivers to receive data, this approach periodically queries Kafka for the latest offsets in each topic+partition, and accordingly defines the offset ranges to process in each batch. When the jobs to process the data are launched, Kafka's simple consumer API is used to read the defined ranges of offsets from Kafka (similar to reading files from a file system). Note that this feature was introduced in Spark 1.3 for the Scala and Java API, and in Spark 1.4 for the Python API.
This approach has the following advantages over the receiver-based approach (i.e. Approach 1).
- *Simplified Parallelism:* No need to create multiple input Kafka streams and union them. With `directStream`, Spark Streaming will create as many RDD partitions as there are Kafka partitions to consume, which will all read data from Kafka in parallel. So there is a one-to-one mapping between Kafka and RDD partitions, which is easier to understand and tune.
- *Efficiency:* Achieving zero-data loss in the first approach required the data to be stored in a Write-Ahead Log, which further replicated the data. This is actually inefficient as the data effectively gets replicated twice - once by Kafka, and a second time by the Write-Ahead Log. This second approach eliminates the problem as there is no receiver, and hence no need for Write-Ahead Logs. As long as you have sufficient Kafka retention, messages can be recovered from Kafka.
- *Exactly-once semantics:* The first approach uses Kafka's high-level API to store consumed offsets in Zookeeper. This is traditionally the way to consume data from Kafka. While this approach (in combination with write-ahead logs) can ensure zero data loss (i.e. at-least-once semantics), there is a small chance some records may get consumed twice under some failures. This occurs because of inconsistencies between data reliably received by Spark Streaming and offsets tracked by Zookeeper. Hence, in this second approach, we use the simple Kafka API that does not use Zookeeper. Offsets are tracked by Spark Streaming within its checkpoints. This eliminates inconsistencies between Spark Streaming and Zookeeper/Kafka, and so each record is received by Spark Streaming effectively exactly once despite failures. In order to achieve exactly-once semantics for output of your results, your output operation that saves the data to an external data store must be either idempotent, or an atomic transaction that saves results and offsets (see [Semantics of output operations](streaming-programming-guide.html#semantics-of-output-operations) in the main programming guide for further information).
Note that one disadvantage of this approach is that it does not update offsets in Zookeeper, hence Zookeeper-based Kafka monitoring tools will not show progress. However, you can access the offsets processed by this approach in each batch and update Zookeeper yourself (see below).
Next, we discuss how to use this approach in your streaming application.
1. **Linking:** This approach is supported only in Scala/Java applications. Link your SBT/Maven project with the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information).
groupId = org.apache.spark
artifactId = spark-streaming-kafka-0-8_{{site.SCALA_BINARY_VERSION}}
version = {{site.SPARK_VERSION_SHORT}}
2. **Programming:** In the streaming application code, import `KafkaUtils` and create an input DStream as follows.
<div class="codetabs">
<div data-lang="scala" markdown="1">
import org.apache.spark.streaming.kafka._
val directKafkaStream = KafkaUtils.createDirectStream[
[key class], [value class], [key decoder class], [value decoder class] ](
streamingContext, [map of Kafka parameters], [set of topics to consume])
You can also pass a `messageHandler` to `createDirectStream` to access `MessageAndMetadata` that contains metadata about the current message and transform it to any desired type.
See the [API docs](api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$).
</div>
<div data-lang="java" markdown="1">
import org.apache.spark.streaming.kafka.*;
JavaPairInputDStream<String, String> directKafkaStream =
KafkaUtils.createDirectStream(streamingContext,
[key class], [value class], [key decoder class], [value decoder class],
[map of Kafka parameters], [set of topics to consume]);
You can also pass a `messageHandler` to `createDirectStream` to access `MessageAndMetadata` that contains metadata about the current message and transform it to any desired type.
See the [API docs](api/java/index.html?org/apache/spark/streaming/kafka/KafkaUtils.html).
</div>
<div data-lang="python" markdown="1">
from pyspark.streaming.kafka import KafkaUtils
directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
You can also pass a `messageHandler` to `createDirectStream` to access `KafkaMessageAndMetadata` that contains metadata about the current message and transform it to any desired type.
By default, the Python API will decode Kafka data as UTF8 encoded strings. You can specify your custom decoding function to decode the byte arrays in Kafka records to any arbitrary data type. See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.kafka.KafkaUtils).
</div>
</div>
In the Kafka parameters, you must specify either `metadata.broker.list` or `bootstrap.servers`.
By default, it will start consuming from the latest offset of each Kafka partition. If you set configuration `auto.offset.reset` in Kafka parameters to `smallest`, then it will start consuming from the smallest offset.
You can also start consuming from any arbitrary offset using other variations of `KafkaUtils.createDirectStream`. Furthermore, if you want to access the Kafka offsets consumed in each batch, you can do the following.
<div class="codetabs">
<div data-lang="scala" markdown="1">
// Hold a reference to the current offset ranges, so it can be used downstream
var offsetRanges = Array.empty[OffsetRange]
directKafkaStream.transform { rdd =>
offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd
}.map {
...
}.foreachRDD { rdd =>
for (o <- offsetRanges) {
println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
}
...
}
</div>
<div data-lang="java" markdown="1">
// Hold a reference to the current offset ranges, so it can be used downstream
AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();
directKafkaStream.transformToPair(rdd -> {
OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
offsetRanges.set(offsets);
return rdd;
}).map(
...
).foreachRDD(rdd -> {
for (OffsetRange o : offsetRanges.get()) {
System.out.println(
o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()
);
}
...
});
</div>
<div data-lang="python" markdown="1">
offsetRanges = []
def storeOffsetRanges(rdd):
global offsetRanges
offsetRanges = rdd.offsetRanges()
return rdd
def printOffsetRanges(rdd):
for o in offsetRanges:
print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)
directKafkaStream \
.transform(storeOffsetRanges) \
.foreachRDD(printOffsetRanges)
</div>
</div>
You can use this to update Zookeeper yourself if you want Zookeeper-based Kafka monitoring tools to show progress of the streaming application.
Note that the typecast to HasOffsetRanges will only succeed if it is done in the first method called on the directKafkaStream, not later down a chain of methods. You can use transform() instead of foreachRDD() as your first method call in order to access offsets, then call further Spark methods. However, be aware that the one-to-one mapping between RDD partition and Kafka partition does not remain after any methods that shuffle or repartition, e.g. reduceByKey() or window().
Another thing to note is that since this approach does not use Receivers, the standard receiver-related [configurations](configuration.html) (that is, configurations of the form `spark.streaming.receiver.*`) will not apply to the input DStreams created by this approach (they will apply to other input DStreams though). Instead, use the [configurations](configuration.html) `spark.streaming.kafka.*`. An important one is `spark.streaming.kafka.maxRatePerPartition`, which is the maximum rate (in messages per second) at which each Kafka partition will be read by this direct API.
3. **Deploying:** This is the same as the first approach.


@@ -3,52 +3,9 @@ layout: global
 title: Spark Streaming + Kafka Integration Guide
 ---
-[Apache Kafka](https://kafka.apache.org/) is publish-subscribe messaging rethought as a distributed, partitioned, replicated commit log service. Please read the [Kafka documentation](https://kafka.apache.org/documentation.html) thoroughly before starting an integration using Spark.
+[Apache Kafka](https://kafka.apache.org/) is publish-subscribe messaging rethought as a distributed, partitioned,
+replicated commit log service. Please read the [Kafka documentation](https://kafka.apache.org/documentation.html)
+thoroughly before starting an integration using Spark.
-The Kafka project introduced a new consumer API between versions 0.8 and 0.10, so there are 2 separate corresponding Spark Streaming packages available. Please choose the correct package for your brokers and desired features; note that the 0.8 integration is compatible with later 0.9 and 0.10 brokers, but the 0.10 integration is not compatible with earlier brokers.
+At the moment, Spark requires Kafka 0.10 and higher. See
+<a href="streaming-kafka-0-10-integration.html">Kafka 0.10 integration documentation</a> for details.
-**Note: Kafka 0.8 support is deprecated as of Spark 2.3.0.**
-<table class="table">
-  <tr><th></th><th><a href="streaming-kafka-0-8-integration.html">spark-streaming-kafka-0-8</a></th><th><a href="streaming-kafka-0-10-integration.html">spark-streaming-kafka-0-10</a></th></tr>
-  <tr>
-    <td>Broker Version</td>
-    <td>0.8.2.1 or higher</td>
-    <td>0.10.0 or higher</td>
-  </tr>
-  <tr>
-    <td>API Maturity</td>
-    <td>Deprecated</td>
-    <td>Stable</td>
-  </tr>
-  <tr>
-    <td>Language Support</td>
-    <td>Scala, Java, Python</td>
-    <td>Scala, Java</td>
-  </tr>
-  <tr>
-    <td>Receiver DStream</td>
-    <td>Yes</td>
-    <td>No</td>
-  </tr>
-  <tr>
-    <td>Direct DStream</td>
-    <td>Yes</td>
-    <td>Yes</td>
-  </tr>
-  <tr>
-    <td>SSL / TLS Support</td>
-    <td>No</td>
-    <td>Yes</td>
-  </tr>
-  <tr>
-    <td>Offset Commit API</td>
-    <td>No</td>
-    <td>Yes</td>
-  </tr>
-  <tr>
-    <td>Dynamic Topic Subscription</td>
-    <td>No</td>
-    <td>Yes</td>
-  </tr>
-</table>


@@ -745,7 +745,7 @@ and add it to the classpath.
 Some of these advanced sources are as follows.
-- **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka broker versions 0.8.2.1 or higher. See the [Kafka Integration Guide](streaming-kafka-integration.html) for more details.
+- **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka broker versions 0.10 or higher. See the [Kafka Integration Guide](streaming-kafka-0-10-integration.html) for more details.
 - **Kinesis:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kinesis Client Library 1.2.1. See the [Kinesis Integration Guide](streaming-kinesis-integration.html) for more details.
@@ -2172,7 +2172,7 @@ the input data stream (using `inputStream.repartition(<number of partitions>)`).
 This distributes the received batches of data across the specified number of machines in the cluster
 before further processing.
-For direct stream, please refer to [Spark Streaming + Kafka Integration Guide](streaming-kafka-integration.html)
+For direct stream, please refer to [Spark Streaming + Kafka Integration Guide](streaming-kafka-0-10-integration.html)
 ### Level of Parallelism in Data Processing
 {:.no_toc}
@@ -2433,7 +2433,7 @@ The following table summarizes the semantics under failures:
 ### With Kafka Direct API
 {:.no_toc}
-In Spark 1.3, we have introduced a new Kafka Direct API, which can ensure that all the Kafka data is received by Spark Streaming exactly once. Along with this, if you implement exactly-once output operation, you can achieve end-to-end exactly-once guarantees. This approach is further discussed in the [Kafka Integration Guide](streaming-kafka-integration.html).
+In Spark 1.3, we have introduced a new Kafka Direct API, which can ensure that all the Kafka data is received by Spark Streaming exactly once. Along with this, if you implement exactly-once output operation, you can achieve end-to-end exactly-once guarantees. This approach is further discussed in the [Kafka Integration Guide](streaming-kafka-0-10-integration.html).
 ## Semantics of output operations
 {:.no_toc}
@@ -2463,7 +2463,7 @@ additional effort may be necessary to achieve exactly-once semantics. There are
 # Where to Go from Here
 * Additional guides
-    - [Kafka Integration Guide](streaming-kafka-integration.html)
+    - [Kafka Integration Guide](streaming-kafka-0-10-integration.html)
     - [Kinesis Integration Guide](streaming-kinesis-integration.html)
     - [Custom Receiver Guide](streaming-custom-receivers.html)
 * Third-party DStream data sources can be found in [Third Party Projects](https://spark.apache.org/third-party-projects.html)


@@ -497,7 +497,7 @@ There are a few built-in sources.
 - **File source** - Reads files written in a directory as a stream of data. Supported file formats are text, csv, json, orc, parquet. See the docs of the DataStreamReader interface for a more up-to-date list, and supported options for each file format. Note that the files must be atomically placed in the given directory, which in most file systems, can be achieved by file move operations.
-- **Kafka source** - Reads data from Kafka. It's compatible with Kafka broker versions 0.10.0 or higher. See the [Kafka Integration Guide](structured-streaming-kafka-integration.html) for more details.
+- **Kafka source** - Reads data from Kafka. It's compatible with Kafka broker versions 0.10.0 or higher. See the [Kafka Integration Guide](structured-streaming-kafka-0-10-integration.html) for more details.
 - **Socket source (for testing)** - Reads UTF8 text data from a socket connection. The listening server socket is at the driver. Note that this should be used only for testing as this does not provide end-to-end fault-tolerance guarantees.
@@ -566,7 +566,7 @@ Here are the details of all the sources in Spark.
 <tr>
   <td><b>Kafka Source</b></td>
   <td>
-    See the <a href="structured-streaming-kafka-integration.html">Kafka Integration Guide</a>.
+    See the <a href="structured-streaming-kafka-0-10-integration.html">Kafka Integration Guide</a>.
   </td>
   <td>Yes</td>
   <td></td>
@@ -1819,7 +1819,7 @@ Here are the details of all the sinks in Spark.
 <tr>
   <td><b>Kafka Sink</b></td>
   <td>Append, Update, Complete</td>
-  <td>See the <a href="structured-streaming-kafka-integration.html">Kafka Integration Guide</a></td>
+  <td>See the <a href="structured-streaming-kafka-0-10-integration.html">Kafka Integration Guide</a></td>
   <td>Yes (at-least-once)</td>
   <td>More details in the <a href="structured-streaming-kafka-integration.html">Kafka Integration Guide</a></td>
 </tr>
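For context, a minimal sketch of the Structured Streaming Kafka source and sink that these links describe; the broker address, topic names, and checkpoint path are placeholder assumptions.

import org.apache.spark.sql.SparkSession

object StructuredKafkaExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("StructuredKafkaExample").getOrCreate()

    // Source: read a Kafka topic as a streaming DataFrame (broker version 0.10.0+).
    val input = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "input-topic")
      .load()
      .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")

    // Sink: write the key/value columns back out to another topic (at-least-once).
    val query = input.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("topic", "output-topic")
      .option("checkpointLocation", "/tmp/structured-kafka-checkpoint")
      .start()

    query.awaitTermination()
  }
}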


@@ -1,56 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
r"""
Counts words in UTF8 encoded, '\n' delimited text directly received from Kafka in every 2 seconds.
Usage: direct_kafka_wordcount.py <broker_list> <topic>
To run this on your local machine, you need to setup Kafka and create a producer first, see
http://kafka.apache.org/documentation.html#quickstart
and then run the example
`$ bin/spark-submit --jars \
external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \
examples/src/main/python/streaming/direct_kafka_wordcount.py \
localhost:9092 test`
"""
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
sys.exit(-1)
sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
ssc = StreamingContext(sc, 2)
brokers, topic = sys.argv[1:]
kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")) \
.map(lambda word: (word, 1)) \
.reduceByKey(lambda a, b: a+b)
counts.pprint()
ssc.start()
ssc.awaitTermination()


@@ -1,56 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
r"""
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.
Usage: kafka_wordcount.py <zk> <topic>
To run this on your local machine, you need to setup Kafka and create a producer first, see
http://kafka.apache.org/documentation.html#quickstart
and then run the example
`$ bin/spark-submit --jars \
external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \
examples/src/main/python/streaming/kafka_wordcount.py \
localhost:2181 test`
"""
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
sys.exit(-1)
sc = SparkContext(appName="PythonStreamingKafkaWordCount")
ssc = StreamingContext(sc, 1)
zkQuorum, topic = sys.argv[1:]
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")) \
.map(lambda word: (word, 1)) \
.reduceByKey(lambda a, b: a+b)
counts.pprint()
ssc.start()
ssc.awaitTermination()


@@ -26,11 +26,9 @@ import org.apache.kafka.clients.consumer._
 import org.apache.kafka.clients.consumer.internals.NoOpConsumerRebalanceListener
 import org.apache.kafka.common.TopicPartition
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.internal.Logging
 /**
- * :: Experimental ::
  * Choice of how to create and configure underlying Kafka Consumers on driver and executors.
  * See [[ConsumerStrategies]] to obtain instances.
  * Kafka 0.10 consumers can require additional, sometimes complex, setup after object
@@ -38,7 +36,6 @@ import org.apache.spark.internal.Logging
  * @tparam K type of Kafka message key
  * @tparam V type of Kafka message value
  */
-@Experimental
 abstract class ConsumerStrategy[K, V] {
   /**
    * Kafka <a href="http://kafka.apache.org/documentation.html#newconsumerconfigs">
@@ -208,13 +205,10 @@ private case class Assign[K, V](
 }
 /**
- * :: Experimental ::
- * object for obtaining instances of [[ConsumerStrategy]]
+ * Object for obtaining instances of [[ConsumerStrategy]]
  */
-@Experimental
 object ConsumerStrategies {
   /**
-   * :: Experimental ::
    * Subscribe to a collection of topics.
    * @param topics collection of topics to subscribe
    * @param kafkaParams Kafka
@@ -227,7 +221,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def Subscribe[K, V](
       topics: Iterable[jl.String],
       kafkaParams: collection.Map[String, Object],
@@ -239,7 +232,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Subscribe to a collection of topics.
    * @param topics collection of topics to subscribe
    * @param kafkaParams Kafka
@@ -249,7 +241,6 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def Subscribe[K, V](
       topics: Iterable[jl.String],
       kafkaParams: collection.Map[String, Object]): ConsumerStrategy[K, V] = {
@@ -260,7 +251,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Subscribe to a collection of topics.
    * @param topics collection of topics to subscribe
    * @param kafkaParams Kafka
@@ -273,7 +263,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def Subscribe[K, V](
       topics: ju.Collection[jl.String],
       kafkaParams: ju.Map[String, Object],
@@ -282,7 +271,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Subscribe to a collection of topics.
    * @param topics collection of topics to subscribe
    * @param kafkaParams Kafka
@@ -292,14 +280,13 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def Subscribe[K, V](
       topics: ju.Collection[jl.String],
       kafkaParams: ju.Map[String, Object]): ConsumerStrategy[K, V] = {
     new Subscribe[K, V](topics, kafkaParams, ju.Collections.emptyMap[TopicPartition, jl.Long]())
   }
-  /** :: Experimental ::
+  /**
    * Subscribe to all topics matching specified pattern to get dynamically assigned partitions.
    * The pattern matching will be done periodically against topics existing at the time of check.
    * @param pattern pattern to subscribe to
@@ -313,7 +300,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def SubscribePattern[K, V](
       pattern: ju.regex.Pattern,
       kafkaParams: collection.Map[String, Object],
@@ -324,7 +310,7 @@ object ConsumerStrategies {
       new ju.HashMap[TopicPartition, jl.Long](offsets.mapValues(l => new jl.Long(l)).asJava))
   }
-  /** :: Experimental ::
+  /**
    * Subscribe to all topics matching specified pattern to get dynamically assigned partitions.
    * The pattern matching will be done periodically against topics existing at the time of check.
    * @param pattern pattern to subscribe to
@@ -335,7 +321,6 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def SubscribePattern[K, V](
       pattern: ju.regex.Pattern,
       kafkaParams: collection.Map[String, Object]): ConsumerStrategy[K, V] = {
@@ -345,7 +330,7 @@ object ConsumerStrategies {
       ju.Collections.emptyMap[TopicPartition, jl.Long]())
   }
-  /** :: Experimental ::
+  /**
    * Subscribe to all topics matching specified pattern to get dynamically assigned partitions.
    * The pattern matching will be done periodically against topics existing at the time of check.
    * @param pattern pattern to subscribe to
@@ -359,7 +344,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def SubscribePattern[K, V](
       pattern: ju.regex.Pattern,
       kafkaParams: ju.Map[String, Object],
@@ -367,7 +351,7 @@ object ConsumerStrategies {
     new SubscribePattern[K, V](pattern, kafkaParams, offsets)
   }
-  /** :: Experimental ::
+  /**
    * Subscribe to all topics matching specified pattern to get dynamically assigned partitions.
    * The pattern matching will be done periodically against topics existing at the time of check.
    * @param pattern pattern to subscribe to
@@ -378,7 +362,6 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def SubscribePattern[K, V](
       pattern: ju.regex.Pattern,
       kafkaParams: ju.Map[String, Object]): ConsumerStrategy[K, V] = {
@@ -389,7 +372,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Assign a fixed collection of TopicPartitions
    * @param topicPartitions collection of TopicPartitions to assign
    * @param kafkaParams Kafka
@@ -402,7 +384,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def Assign[K, V](
       topicPartitions: Iterable[TopicPartition],
       kafkaParams: collection.Map[String, Object],
@@ -414,7 +395,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Assign a fixed collection of TopicPartitions
    * @param topicPartitions collection of TopicPartitions to assign
    * @param kafkaParams Kafka
@@ -424,7 +404,6 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def Assign[K, V](
       topicPartitions: Iterable[TopicPartition],
       kafkaParams: collection.Map[String, Object]): ConsumerStrategy[K, V] = {
@@ -435,7 +414,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Assign a fixed collection of TopicPartitions
    * @param topicPartitions collection of TopicPartitions to assign
    * @param kafkaParams Kafka
@@ -448,7 +426,6 @@ object ConsumerStrategies {
    * TopicPartition, the committed offset (if applicable) or kafka param
    * auto.offset.reset will be used.
    */
-  @Experimental
   def Assign[K, V](
       topicPartitions: ju.Collection[TopicPartition],
       kafkaParams: ju.Map[String, Object],
@@ -457,7 +434,6 @@ object ConsumerStrategies {
   }
   /**
-   * :: Experimental ::
    * Assign a fixed collection of TopicPartitions
    * @param topicPartitions collection of TopicPartitions to assign
    * @param kafkaParams Kafka
@@ -467,7 +443,6 @@ object ConsumerStrategies {
    * Requires "bootstrap.servers" to be set
    * with Kafka broker(s) specified in host1:port1,host2:port2 form.
    */
-  @Experimental
   def Assign[K, V](
       topicPartitions: ju.Collection[TopicPartition],
       kafkaParams: ju.Map[String, Object]): ConsumerStrategy[K, V] = {
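As a hedged illustration of the ConsumerStrategies factory methods above (now stable rather than @Experimental), here is a short sketch; the topic name, partitions, and starting offsets are illustrative assumptions.

import java.{util => ju}

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, ConsumerStrategy}

object ConsumerStrategyExamples {
  // Shared Kafka parameters; values are placeholders.
  val kafkaParams: Map[String, Object] = Map[String, Object](
    "bootstrap.servers" -> "localhost:9092",
    "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
    "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
    "group.id" -> "example-group")

  // Subscribe to a fixed list of topics.
  val subscribe: ConsumerStrategy[String, String] =
    ConsumerStrategies.Subscribe[String, String](Seq("events"), kafkaParams)

  // Subscribe to every topic matching a pattern; partitions are assigned dynamically.
  val byPattern: ConsumerStrategy[String, String] =
    ConsumerStrategies.SubscribePattern[String, String](
      ju.regex.Pattern.compile("events.*"), kafkaParams)

  // Pin specific TopicPartitions and start one of them from a chosen offset.
  val fixedAssignment: ConsumerStrategy[String, String] =
    ConsumerStrategies.Assign[String, String](
      Seq(new TopicPartition("events", 0), new TopicPartition("events", 1)),
      kafkaParams,
      Map(new TopicPartition("events", 0) -> 0L))
}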


@@ -23,7 +23,6 @@ import org.apache.kafka.clients.consumer._
 import org.apache.kafka.common.TopicPartition
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.{ JavaRDD, JavaSparkContext }
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
@@ -32,13 +31,10 @@ import org.apache.spark.streaming.api.java.{ JavaInputDStream, JavaStreamingCont
 import org.apache.spark.streaming.dstream._
 /**
- * :: Experimental ::
  * object for constructing Kafka streams and RDDs
  */
-@Experimental
 object KafkaUtils extends Logging {
   /**
-   * :: Experimental ::
    * Scala constructor for a batch-oriented interface for consuming from Kafka.
    * Starting and ending offsets are specified in advance,
    * so that you can control exactly-once semantics.
@@ -52,7 +48,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createRDD[K, V](
       sc: SparkContext,
       kafkaParams: ju.Map[String, Object],
@@ -75,7 +70,6 @@ object KafkaUtils extends Logging {
   }
   /**
-   * :: Experimental ::
    * Java constructor for a batch-oriented interface for consuming from Kafka.
    * Starting and ending offsets are specified in advance,
    * so that you can control exactly-once semantics.
@@ -89,7 +83,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createRDD[K, V](
       jsc: JavaSparkContext,
       kafkaParams: ju.Map[String, Object],
@@ -101,7 +94,6 @@ object KafkaUtils extends Logging {
   }
   /**
-   * :: Experimental ::
    * Scala constructor for a DStream where
    * each given Kafka topic/partition corresponds to an RDD partition.
    * The spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number
@@ -114,7 +106,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createDirectStream[K, V](
       ssc: StreamingContext,
       locationStrategy: LocationStrategy,
@@ -125,7 +116,6 @@ object KafkaUtils extends Logging {
   }
   /**
-   * :: Experimental ::
    * Scala constructor for a DStream where
    * each given Kafka topic/partition corresponds to an RDD partition.
    * @param locationStrategy In most cases, pass in [[LocationStrategies.PreferConsistent]],
@@ -137,7 +127,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createDirectStream[K, V](
       ssc: StreamingContext,
       locationStrategy: LocationStrategy,
@@ -148,7 +137,6 @@ object KafkaUtils extends Logging {
   }
   /**
-   * :: Experimental ::
    * Java constructor for a DStream where
    * each given Kafka topic/partition corresponds to an RDD partition.
    * @param locationStrategy In most cases, pass in [[LocationStrategies.PreferConsistent]],
@@ -158,7 +146,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createDirectStream[K, V](
       jssc: JavaStreamingContext,
       locationStrategy: LocationStrategy,
@@ -170,7 +157,6 @@ object KafkaUtils extends Logging {
   }
   /**
-   * :: Experimental ::
    * Java constructor for a DStream where
    * each given Kafka topic/partition corresponds to an RDD partition.
    * @param locationStrategy In most cases, pass in [[LocationStrategies.PreferConsistent]],
@@ -182,7 +168,6 @@ object KafkaUtils extends Logging {
    * @tparam K type of Kafka message key
    * @tparam V type of Kafka message value
    */
-  @Experimental
   def createDirectStream[K, V](
       jssc: JavaStreamingContext,
       locationStrategy: LocationStrategy,
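A minimal sketch of the batch-oriented KafkaUtils.createRDD entry point touched above; the topic name, offsets, and broker address are placeholder assumptions.

import java.{util => ju}

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies, OffsetRange}

object KafkaRDDExample {
  def wordCounts(sc: SparkContext): RDD[(String, Long)] = {
    // Kafka connection parameters; values are placeholders.
    val kafkaParams = new ju.HashMap[String, Object]()
    kafkaParams.put("bootstrap.servers", "localhost:9092")
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaParams.put("group.id", "example-group")

    // Exact offset ranges to read per partition: fromOffset inclusive, untilOffset exclusive.
    val offsetRanges = Array(
      OffsetRange("events", partition = 0, fromOffset = 0L, untilOffset = 100L),
      OffsetRange("events", partition = 1, fromOffset = 0L, untilOffset = 100L))

    val rdd: RDD[ConsumerRecord[String, String]] = KafkaUtils.createRDD[String, String](
      sc, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent)

    rdd.map(_.value).flatMap(_.split(" ")).map((_, 1L)).reduceByKey(_ + _)
  }
}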


@ -23,18 +23,14 @@ import scala.collection.JavaConverters._
import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.TopicPartition
import org.apache.spark.annotation.Experimental
/** /**
* :: Experimental ::
* Choice of how to schedule consumers for a given TopicPartition on an executor. * Choice of how to schedule consumers for a given TopicPartition on an executor.
* See [[LocationStrategies]] to obtain instances. * See [[LocationStrategies]] to obtain instances.
* Kafka 0.10 consumers prefetch messages, so it's important for performance * Kafka 0.10 consumers prefetch messages, so it's important for performance
* to keep cached consumers on appropriate executors, not recreate them for every partition. * to keep cached consumers on appropriate executors, not recreate them for every partition.
* Choice of location is only a preference, not an absolute; partitions may be scheduled elsewhere. * Choice of location is only a preference, not an absolute; partitions may be scheduled elsewhere.
*/ */
@Experimental
sealed abstract class LocationStrategy sealed abstract class LocationStrategy
private case object PreferBrokers extends LocationStrategy private case object PreferBrokers extends LocationStrategy
@ -44,42 +40,32 @@ private case object PreferConsistent extends LocationStrategy
private case class PreferFixed(hostMap: ju.Map[TopicPartition, String]) extends LocationStrategy private case class PreferFixed(hostMap: ju.Map[TopicPartition, String]) extends LocationStrategy
/** /**
* :: Experimental :: object to obtain instances of [[LocationStrategy]] * Object to obtain instances of [[LocationStrategy]]
*
*/ */
@Experimental
object LocationStrategies { object LocationStrategies {
/** /**
* :: Experimental ::
* Use this only if your executors are on the same nodes as your Kafka brokers. * Use this only if your executors are on the same nodes as your Kafka brokers.
*/ */
@Experimental
def PreferBrokers: LocationStrategy = def PreferBrokers: LocationStrategy =
org.apache.spark.streaming.kafka010.PreferBrokers org.apache.spark.streaming.kafka010.PreferBrokers
/** /**
* :: Experimental ::
* Use this in most cases; it will consistently distribute partitions across all executors. * Use this in most cases; it will consistently distribute partitions across all executors.
*/ */
@Experimental
def PreferConsistent: LocationStrategy = def PreferConsistent: LocationStrategy =
org.apache.spark.streaming.kafka010.PreferConsistent org.apache.spark.streaming.kafka010.PreferConsistent
/** /**
* :: Experimental ::
* Use this to place particular TopicPartitions on particular hosts if your load is uneven. * Use this to place particular TopicPartitions on particular hosts if your load is uneven.
* Any TopicPartition not specified in the map will use a consistent location. * Any TopicPartition not specified in the map will use a consistent location.
*/ */
@Experimental
def PreferFixed(hostMap: collection.Map[TopicPartition, String]): LocationStrategy = def PreferFixed(hostMap: collection.Map[TopicPartition, String]): LocationStrategy =
new PreferFixed(new ju.HashMap[TopicPartition, String](hostMap.asJava)) new PreferFixed(new ju.HashMap[TopicPartition, String](hostMap.asJava))
/** /**
* :: Experimental ::
* Use this to place particular TopicPartitions on particular hosts if your load is uneven. * Use this to place particular TopicPartitions on particular hosts if your load is uneven.
* Any TopicPartition not specified in the map will use a consistent location. * Any TopicPartition not specified in the map will use a consistent location.
*/ */
@Experimental
def PreferFixed(hostMap: ju.Map[TopicPartition, String]): LocationStrategy = def PreferFixed(hostMap: ju.Map[TopicPartition, String]): LocationStrategy =
new PreferFixed(hostMap) new PreferFixed(hostMap)
} }
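All three placement strategies remain, minus the annotations. A small hedged illustration of PreferFixed (the topic, partitions, and executor host names below are invented); any partition not listed falls back to consistent placement.

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.LocationStrategies

// Pin two known-hot partitions to specific executor hosts; everything else is
// scheduled as if PreferConsistent had been used.
val pinned = Map(
  new TopicPartition("example-topic", 0) -> "worker-1.example.com",
  new TopicPartition("example-topic", 1) -> "worker-2.example.com")

val strategy = LocationStrategies.PreferFixed(pinned)  // Scala Map overload from the hunk above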

View file

@ -20,8 +20,6 @@ package org.apache.spark.streaming.kafka010
import org.apache.kafka.clients.consumer.OffsetCommitCallback import org.apache.kafka.clients.consumer.OffsetCommitCallback
import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.TopicPartition
import org.apache.spark.annotation.Experimental
/** /**
* Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the * Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the
* offset ranges in RDDs generated by the direct Kafka DStream (see * offset ranges in RDDs generated by the direct Kafka DStream (see
@ -38,7 +36,6 @@ trait HasOffsetRanges {
} }
/** /**
* :: Experimental ::
* Represents any object that can commit a collection of [[OffsetRange]]s. * Represents any object that can commit a collection of [[OffsetRange]]s.
* The direct Kafka DStream implements this interface (see * The direct Kafka DStream implements this interface (see
* [[KafkaUtils.createDirectStream]]). * [[KafkaUtils.createDirectStream]]).
@ -56,25 +53,20 @@ trait HasOffsetRanges {
* }) * })
* }}} * }}}
*/ */
@Experimental
trait CanCommitOffsets { trait CanCommitOffsets {
/** /**
* :: Experimental ::
* Queue up offset ranges for commit to Kafka at a future time. Threadsafe. * Queue up offset ranges for commit to Kafka at a future time. Threadsafe.
* This is only needed if you intend to store offsets in Kafka, instead of your own store. * This is only needed if you intend to store offsets in Kafka, instead of your own store.
* @param offsetRanges The maximum untilOffset for a given partition will be used at commit. * @param offsetRanges The maximum untilOffset for a given partition will be used at commit.
*/ */
@Experimental
def commitAsync(offsetRanges: Array[OffsetRange]): Unit def commitAsync(offsetRanges: Array[OffsetRange]): Unit
/** /**
* :: Experimental ::
* Queue up offset ranges for commit to Kafka at a future time. Threadsafe. * Queue up offset ranges for commit to Kafka at a future time. Threadsafe.
* This is only needed if you intend to store offsets in Kafka, instead of your own store. * This is only needed if you intend to store offsets in Kafka, instead of your own store.
* @param offsetRanges The maximum untilOffset for a given partition will be used at commit. * @param offsetRanges The maximum untilOffset for a given partition will be used at commit.
* @param callback Only the most recently provided callback will be used at commit. * @param callback Only the most recently provided callback will be used at commit.
*/ */
@Experimental
def commitAsync(offsetRanges: Array[OffsetRange], callback: OffsetCommitCallback): Unit def commitAsync(offsetRanges: Array[OffsetRange], callback: OffsetCommitCallback): Unit
} }
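The commit workflow itself is untouched by this change. A short sketch of the usual pattern, assuming stream is the DStream returned by KafkaUtils.createDirectStream:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

stream.foreachRDD { rdd =>
  // Capture the offset ranges backing this batch before any shuffle or repartition.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  // ... process the batch ...

  // Queue the ranges; the actual commit happens asynchronously on the driver.
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}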

View file

@ -20,14 +20,11 @@ package org.apache.spark.streaming.kafka010
import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.annotation.Experimental
/** /**
* :: Experimental ::
* Interface for user-supplied configurations that can't otherwise be set via Spark properties, * Interface for user-supplied configurations that can't otherwise be set via Spark properties,
* because they need tweaking on a per-partition basis. * because they need tweaking on a per-partition basis.
*/ */
@Experimental
abstract class PerPartitionConfig extends Serializable { abstract class PerPartitionConfig extends Serializable {
/** /**
* Maximum rate (number of records per second) at which data will be read * Maximum rate (number of records per second) at which data will be read

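The hunk is cut off here, but the abstract class it annotates is otherwise unchanged. As a hedged sketch (the method name follows the scaladoc above; the topic and rate values are invented), a subclass can throttle a skewed partition differently from the rest:

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.PerPartitionConfig

// Hypothetical per-partition throttling: partition 0 of example-topic is rate-limited
// more aggressively than everything else.
class SkewAwareConfig extends PerPartitionConfig {
  override def maxRatePerPartition(tp: TopicPartition): Long =
    if (tp.topic == "example-topic" && tp.partition == 0) 500L else 5000L
}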
View file

@ -1,170 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>3.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>spark-streaming-kafka-0-8-assembly_2.11</artifactId>
<packaging>jar</packaging>
<name>Spark Project External Kafka Assembly</name>
<url>http://spark.apache.org/</url>
<properties>
<sbt.project.name>streaming-kafka-0-8-assembly</sbt.project.name>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-8_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<!--
Demote already included in the Spark assembly.
-->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.lz4</groupId>
<artifactId>lz4-java</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
<classifier>${avro.mapred.classifier}</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
<resource>log4j.properties</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View file

@ -1,109 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>3.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
<properties>
<sbt.project.name>streaming-kafka-0-8</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Integration for Kafka 0.8</name>
<url>http://spark.apache.org/</url>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_${scala.binary.version}</artifactId>
<version>0.8.2.1</version>
<exclusions>
<exclusion>
<groupId>com.sun.jmx</groupId>
<artifactId>jmxri</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jdmk</groupId>
<artifactId>jmxtools</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.jopt-simple</groupId>
<artifactId>jopt-simple</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>net.sf.jopt-simple</groupId>
<artifactId>jopt-simple</artifactId>
<version>3.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalacheck</groupId>
<artifactId>scalacheck_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
</dependency>
<!--
This spark-tags test-dep is needed even though it isn't used in this module, otherwise testing-cmds that exclude
them will yield errors.
-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
</build>
</project>
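With this module removed, builds that declared spark-streaming-kafka-0-8 need to move to the 0.10 artifact. A hedged sbt example (the version string is illustrative):

// build.sbt: replace the removed 0.8 dependency with the Kafka 0.10 integration.
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.0.0"

Maven builds make the equivalent artifactId swap in their POM.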

View file

@ -1,68 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import org.apache.spark.annotation.Experimental
/**
* Represents the host and port info for a Kafka broker.
* Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID.
*/
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
final class Broker private(
/** Broker's hostname */
val host: String,
/** Broker's port */
val port: Int) extends Serializable {
override def equals(obj: Any): Boolean = obj match {
case that: Broker =>
this.host == that.host &&
this.port == that.port
case _ => false
}
override def hashCode: Int = {
41 * (41 + host.hashCode) + port
}
override def toString(): String = {
s"Broker($host, $port)"
}
}
/**
* :: Experimental ::
* Companion object that provides methods to create instances of [[Broker]].
*/
@Experimental
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
object Broker {
def create(host: String, port: Int): Broker =
new Broker(host, port)
def apply(host: String, port: Int): Broker =
new Broker(host, port)
def unapply(broker: Broker): Option[(String, Int)] = {
if (broker == null) {
None
} else {
Some((broker.host, broker.port))
}
}
}

View file

@ -1,233 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import scala.annotation.tailrec
import scala.collection.mutable
import scala.reflect.ClassTag
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
import org.apache.spark.streaming.scheduler.rate.RateEstimator
/**
* A stream of [[KafkaRDD]] where
* each given Kafka topic/partition corresponds to an RDD partition.
* The spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number
* of messages
* per second that each '''partition''' will accept.
* Starting offsets are specified in advance,
* and this DStream is not responsible for committing offsets,
* so that you can control exactly-once semantics.
* For an easy interface to Kafka-managed offsets,
* see [[KafkaCluster]]
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>.
* Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
* NOT zookeeper servers, specified in host1:port1,host2:port2 form.
* @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
* starting point of the stream
* @param messageHandler function for translating each message into the desired type
*/
private[streaming]
class DirectKafkaInputDStream[
K: ClassTag,
V: ClassTag,
U <: Decoder[K]: ClassTag,
T <: Decoder[V]: ClassTag,
R: ClassTag](
_ssc: StreamingContext,
val kafkaParams: Map[String, String],
val fromOffsets: Map[TopicAndPartition, Long],
messageHandler: MessageAndMetadata[K, V] => R
) extends InputDStream[R](_ssc) with Logging {
val maxRetries = context.sparkContext.getConf.getInt(
"spark.streaming.kafka.maxRetries", 1)
private[streaming] override def name: String = s"Kafka direct stream [$id]"
protected[streaming] override val checkpointData =
new DirectKafkaInputDStreamCheckpointData
/**
* Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker.
*/
override protected[streaming] val rateController: Option[RateController] = {
if (RateController.isBackPressureEnabled(ssc.conf)) {
Some(new DirectKafkaRateController(id,
RateEstimator.create(ssc.conf, context.graph.batchDuration)))
} else {
None
}
}
protected val kc = new KafkaCluster(kafkaParams)
private val maxRateLimitPerPartition: Long = context.sparkContext.getConf.getLong(
"spark.streaming.kafka.maxRatePerPartition", 0)
private val initialRate = context.sparkContext.getConf.getLong(
"spark.streaming.backpressure.initialRate", 0)
protected[streaming] def maxMessagesPerPartition(
offsets: Map[TopicAndPartition, Long]): Option[Map[TopicAndPartition, Long]] = {
val estimatedRateLimit = rateController.map { x => {
val lr = x.getLatestRate()
if (lr > 0) lr else initialRate
}}
// calculate a per-partition rate limit based on current lag
val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match {
case Some(rate) =>
val lagPerPartition = offsets.map { case (tp, offset) =>
tp -> Math.max(offset - currentOffsets(tp), 0)
}
val totalLag = lagPerPartition.values.sum
lagPerPartition.map { case (tp, lag) =>
val backpressureRate = lag / totalLag.toDouble * rate
tp -> (if (maxRateLimitPerPartition > 0) {
Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate)
}
case None => offsets.map { case (tp, offset) => tp -> maxRateLimitPerPartition.toDouble }
}
if (effectiveRateLimitPerPartition.values.sum > 0) {
val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000
Some(effectiveRateLimitPerPartition.map {
case (tp, limit) => tp -> Math.max((secsPerBatch * limit).toLong, 1L)
})
} else {
None
}
}
protected var currentOffsets = fromOffsets
@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
// Either.fold would confuse @tailrec, do it manually
if (o.isLeft) {
val err = o.left.get.toString
if (retries <= 0) {
throw new SparkException(err)
} else {
logError(err)
Thread.sleep(kc.config.refreshLeaderBackoffMs)
latestLeaderOffsets(retries - 1)
}
} else {
o.right.get
}
}
// limits the maximum number of messages per partition
protected def clamp(
leaderOffsets: Map[TopicAndPartition, LeaderOffset]): Map[TopicAndPartition, LeaderOffset] = {
val offsets = leaderOffsets.mapValues(lo => lo.offset)
maxMessagesPerPartition(offsets).map { mmp =>
mmp.map { case (tp, messages) =>
val lo = leaderOffsets(tp)
tp -> lo.copy(offset = Math.min(currentOffsets(tp) + messages, lo.offset))
}
}.getOrElse(leaderOffsets)
}
override def compute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = {
val untilOffsets = clamp(latestLeaderOffsets(maxRetries))
val rdd = KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)
// Report the record number and metadata of this batch interval to InputInfoTracker.
val offsetRanges = currentOffsets.map { case (tp, fo) =>
val uo = untilOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo.offset)
}
val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
}.mkString("\n")
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges.toList,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
Some(rdd)
}
override def start(): Unit = {
}
def stop(): Unit = {
}
private[streaming]
class DirectKafkaInputDStreamCheckpointData extends DStreamCheckpointData(this) {
def batchForTime: mutable.HashMap[Time, Array[(String, Int, Long, Long)]] = {
data.asInstanceOf[mutable.HashMap[Time, Array[OffsetRange.OffsetRangeTuple]]]
}
override def update(time: Time): Unit = {
batchForTime.clear()
generatedRDDs.foreach { kv =>
val a = kv._2.asInstanceOf[KafkaRDD[K, V, U, T, R]].offsetRanges.map(_.toTuple).toArray
batchForTime += kv._1 -> a
}
}
override def cleanup(time: Time): Unit = { }
override def restore(): Unit = {
// this is assuming that the topics don't change during execution, which is true currently
val topics = fromOffsets.keySet
val leaders = KafkaCluster.checkErrors(kc.findLeaders(topics))
batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) =>
logInfo(s"Restoring KafkaRDD for time $t ${b.mkString("[", ", ", "]")}")
generatedRDDs += t -> new KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, b.map(OffsetRange(_)), leaders, messageHandler)
}
}
}
/**
* A RateController to retrieve the rate from RateEstimator.
*/
private[streaming] class DirectKafkaRateController(id: Int, estimator: RateEstimator)
extends RateController(id, estimator) {
override def publish(rate: Long): Unit = ()
}
}
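The most algorithm-heavy piece of this deleted class is maxMessagesPerPartition, which splits the estimated backpressure rate across partitions in proportion to their lag and then converts it into a per-batch message cap. A simplified, dependency-free rendering of that lag-proportional arithmetic (names are invented; the same idea lives on in the 0.10 DirectKafkaInputDStream):

// lag: partition -> records behind; estimatedRate: records/sec from backpressure;
// maxRatePerPartition <= 0 means "no explicit cap"; returns a per-partition message cap.
def perPartitionMessageCap(
    lag: Map[Int, Long],
    estimatedRate: Double,
    maxRatePerPartition: Long,
    batchSeconds: Double): Map[Int, Long] = {
  val totalLag = math.max(lag.values.sum.toDouble, 1.0)
  lag.map { case (partition, partitionLag) =>
    // Each partition gets a share of the total rate proportional to its lag.
    val share = partitionLag / totalLag * estimatedRate
    val limited =
      if (maxRatePerPartition > 0) math.min(share, maxRatePerPartition.toDouble) else share
    // Never go below one record per partition per batch.
    partition -> math.max((batchSeconds * limited).toLong, 1L)
  }
}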

View file

@ -1,439 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.util.Properties
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import scala.util.control.NonFatal
import kafka.api._
import kafka.common.{ErrorMapping, OffsetAndMetadata, OffsetMetadataAndError, TopicAndPartition}
import kafka.consumer.{ConsumerConfig, SimpleConsumer}
import org.apache.spark.SparkException
import org.apache.spark.annotation.DeveloperApi
/**
* :: DeveloperApi ::
* Convenience methods for interacting with a Kafka cluster.
* See <a href="https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol">
* A Guide To The Kafka Protocol</a> for more details on individual api calls.
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>.
* Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
* NOT zookeeper servers, specified in host1:port1,host2:port2 form
*/
@DeveloperApi
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
class KafkaCluster(val kafkaParams: Map[String, String]) extends Serializable {
import KafkaCluster.{Err, LeaderOffset, SimpleConsumerConfig}
// ConsumerConfig isn't serializable
@transient private var _config: SimpleConsumerConfig = null
def config: SimpleConsumerConfig = this.synchronized {
if (_config == null) {
_config = SimpleConsumerConfig(kafkaParams)
}
_config
}
def connect(host: String, port: Int): SimpleConsumer =
new SimpleConsumer(host, port, config.socketTimeoutMs,
config.socketReceiveBufferBytes, config.clientId)
def connectLeader(topic: String, partition: Int): Either[Err, SimpleConsumer] =
findLeader(topic, partition).right.map(hp => connect(hp._1, hp._2))
// Metadata api
// scalastyle:off
// https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-MetadataAPI
// scalastyle:on
def findLeader(topic: String, partition: Int): Either[Err, (String, Int)] = {
val req = TopicMetadataRequest(TopicMetadataRequest.CurrentVersion,
0, config.clientId, Seq(topic))
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp: TopicMetadataResponse = consumer.send(req)
resp.topicsMetadata.find(_.topic == topic).flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.find(_.partitionId == partition)
}.foreach { pm: PartitionMetadata =>
pm.leader.foreach { leader =>
return Right((leader.host, leader.port))
}
}
}
Left(errs)
}
def findLeaders(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, (String, Int)]] = {
val topics = topicAndPartitions.map(_.topic)
val response = getPartitionMetadata(topics).right
val answer = response.flatMap { tms: Set[TopicMetadata] =>
val leaderMap = tms.flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.flatMap { pm: PartitionMetadata =>
val tp = TopicAndPartition(tm.topic, pm.partitionId)
if (topicAndPartitions(tp)) {
pm.leader.map { l =>
tp -> (l.host -> l.port)
}
} else {
None
}
}
}.toMap
if (leaderMap.keys.size == topicAndPartitions.size) {
Right(leaderMap)
} else {
val missing = topicAndPartitions.diff(leaderMap.keySet)
val err = new Err
err += new SparkException(s"Couldn't find leaders for ${missing}")
Left(err)
}
}
answer
}
def getPartitions(topics: Set[String]): Either[Err, Set[TopicAndPartition]] = {
getPartitionMetadata(topics).right.map { r =>
r.flatMap { tm: TopicMetadata =>
tm.partitionsMetadata.map { pm: PartitionMetadata =>
TopicAndPartition(tm.topic, pm.partitionId)
}
}
}
}
def getPartitionMetadata(topics: Set[String]): Either[Err, Set[TopicMetadata]] = {
val req = TopicMetadataRequest(
TopicMetadataRequest.CurrentVersion, 0, config.clientId, topics.toSeq)
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp: TopicMetadataResponse = consumer.send(req)
val respErrs = resp.topicsMetadata.filter(m => m.errorCode != ErrorMapping.NoError)
if (respErrs.isEmpty) {
return Right(resp.topicsMetadata.toSet)
} else {
respErrs.foreach { m =>
val cause = ErrorMapping.exceptionFor(m.errorCode)
val msg = s"Error getting partition metadata for '${m.topic}'. Does the topic exist?"
errs += new SparkException(msg, cause)
}
}
}
Left(errs)
}
// Leader offset api
// scalastyle:off
// https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
// scalastyle:on
def getLatestLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
getLeaderOffsets(topicAndPartitions, OffsetRequest.LatestTime)
def getEarliestLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, LeaderOffset]] =
getLeaderOffsets(topicAndPartitions, OffsetRequest.EarliestTime)
def getLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition],
before: Long
): Either[Err, Map[TopicAndPartition, LeaderOffset]] = {
getLeaderOffsets(topicAndPartitions, before, 1).right.map { r =>
r.map { kv =>
// mapValues isn't serializable, see SI-7005
kv._1 -> kv._2.head
}
}
}
private def flip[K, V](m: Map[K, V]): Map[V, Seq[K]] =
m.groupBy(_._2).map { kv =>
kv._1 -> kv._2.keys.toSeq
}
def getLeaderOffsets(
topicAndPartitions: Set[TopicAndPartition],
before: Long,
maxNumOffsets: Int
): Either[Err, Map[TopicAndPartition, Seq[LeaderOffset]]] = {
findLeaders(topicAndPartitions).right.flatMap { tpToLeader =>
val leaderToTp: Map[(String, Int), Seq[TopicAndPartition]] = flip(tpToLeader)
val leaders = leaderToTp.keys
var result = Map[TopicAndPartition, Seq[LeaderOffset]]()
val errs = new Err
withBrokers(leaders, errs) { consumer =>
val partitionsToGetOffsets: Seq[TopicAndPartition] =
leaderToTp((consumer.host, consumer.port))
val reqMap = partitionsToGetOffsets.map { tp: TopicAndPartition =>
tp -> PartitionOffsetRequestInfo(before, maxNumOffsets)
}.toMap
val req = OffsetRequest(reqMap)
val resp = consumer.getOffsetsBefore(req)
val respMap = resp.partitionErrorAndOffsets
partitionsToGetOffsets.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { por: PartitionOffsetsResponse =>
if (por.error == ErrorMapping.NoError) {
if (por.offsets.nonEmpty) {
result += tp -> por.offsets.map { off =>
LeaderOffset(consumer.host, consumer.port, off)
}
} else {
errs += new SparkException(
s"Empty offsets for ${tp}, is ${before} before log beginning?")
}
} else {
errs += ErrorMapping.exceptionFor(por.error)
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs += new SparkException(s"Couldn't find leader offsets for ${missing}")
Left(errs)
}
}
// Consumer offset api
// scalastyle:off
// https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
// scalastyle:on
// this 0 here indicates api version, in this case the original ZK backed api.
private def defaultConsumerApiVersion: Short = 0
/**
* Requires Kafka 0.8.1.1 or later.
* Defaults to the original ZooKeeper backed API version.
*/
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, Long]] =
getConsumerOffsets(groupId, topicAndPartitions, defaultConsumerApiVersion)
def getConsumerOffsets(
groupId: String,
topicAndPartitions: Set[TopicAndPartition],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Long]] = {
getConsumerOffsetMetadata(groupId, topicAndPartitions, consumerApiVersion).right.map { r =>
r.map { kv =>
kv._1 -> kv._2.offset
}
}
}
/**
* Requires Kafka 0.8.1.1 or later.
* Defaults to the original ZooKeeper backed API version.
*/
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition]
): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] =
getConsumerOffsetMetadata(groupId, topicAndPartitions, defaultConsumerApiVersion)
def getConsumerOffsetMetadata(
groupId: String,
topicAndPartitions: Set[TopicAndPartition],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, OffsetMetadataAndError]] = {
var result = Map[TopicAndPartition, OffsetMetadataAndError]()
val req = OffsetFetchRequest(groupId, topicAndPartitions.toSeq, consumerApiVersion)
val errs = new Err
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp = consumer.fetchOffsets(req)
val respMap = resp.requestInfo
val needed = topicAndPartitions.diff(result.keySet)
needed.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { ome: OffsetMetadataAndError =>
if (ome.error == ErrorMapping.NoError) {
result += tp -> ome
} else {
errs += ErrorMapping.exceptionFor(ome.error)
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs += new SparkException(s"Couldn't find consumer offsets for ${missing}")
Left(errs)
}
/**
* Requires Kafka 0.8.1.1 or later.
* Defaults to the original ZooKeeper backed API version.
*/
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long]
): Either[Err, Map[TopicAndPartition, Short]] =
setConsumerOffsets(groupId, offsets, defaultConsumerApiVersion)
def setConsumerOffsets(
groupId: String,
offsets: Map[TopicAndPartition, Long],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Short]] = {
val meta = offsets.map { kv =>
kv._1 -> OffsetAndMetadata(kv._2)
}
setConsumerOffsetMetadata(groupId, meta, consumerApiVersion)
}
/**
* Requires Kafka 0.8.1.1 or later.
* Defaults to the original ZooKeeper backed API version.
*/
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata]
): Either[Err, Map[TopicAndPartition, Short]] =
setConsumerOffsetMetadata(groupId, metadata, defaultConsumerApiVersion)
def setConsumerOffsetMetadata(
groupId: String,
metadata: Map[TopicAndPartition, OffsetAndMetadata],
consumerApiVersion: Short
): Either[Err, Map[TopicAndPartition, Short]] = {
var result = Map[TopicAndPartition, Short]()
val req = OffsetCommitRequest(groupId, metadata, consumerApiVersion)
val errs = new Err
val topicAndPartitions = metadata.keySet
withBrokers(Random.shuffle(config.seedBrokers), errs) { consumer =>
val resp = consumer.commitOffsets(req)
val respMap = resp.commitStatus
val needed = topicAndPartitions.diff(result.keySet)
needed.foreach { tp: TopicAndPartition =>
respMap.get(tp).foreach { err: Short =>
if (err == ErrorMapping.NoError) {
result += tp -> err
} else {
errs += ErrorMapping.exceptionFor(err)
}
}
}
if (result.keys.size == topicAndPartitions.size) {
return Right(result)
}
}
val missing = topicAndPartitions.diff(result.keySet)
errs += new SparkException(s"Couldn't set offsets for ${missing}")
Left(errs)
}
// Try a call against potentially multiple brokers, accumulating errors
private def withBrokers(brokers: Iterable[(String, Int)], errs: Err)
(fn: SimpleConsumer => Any): Unit = {
brokers.foreach { hp =>
var consumer: SimpleConsumer = null
try {
consumer = connect(hp._1, hp._2)
fn(consumer)
} catch {
case NonFatal(e) =>
errs += e
} finally {
if (consumer != null) {
consumer.close()
}
}
}
}
}
@DeveloperApi
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
object KafkaCluster {
type Err = ArrayBuffer[Throwable]
/** If the result is right, return it, otherwise throw SparkException */
def checkErrors[T](result: Either[Err, T]): T = {
result.fold(
errs => throw new SparkException(errs.mkString("\n")),
ok => ok
)
}
case class LeaderOffset(host: String, port: Int, offset: Long)
/**
* High-level kafka consumers connect to ZK. ConsumerConfig assumes this use case.
* Simple consumers connect directly to brokers, but need many of the same configs.
* This subclass won't warn about missing ZK params, or presence of broker params.
*/
class SimpleConsumerConfig private(brokers: String, originalProps: Properties)
extends ConsumerConfig(originalProps) {
val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp =>
val hpa = hp.split(":")
if (hpa.size == 1) {
throw new SparkException(s"Broker not in the correct format of <host>:<port> [$brokers]")
}
(hpa(0), hpa(1).toInt)
}
}
object SimpleConsumerConfig {
/**
* Make a consumer config without requiring group.id or zookeeper.connect,
* since communicating with brokers also needs common settings such as timeout
*/
def apply(kafkaParams: Map[String, String]): SimpleConsumerConfig = {
// These keys are from other pre-existing kafka configs for specifying brokers, accept either
val brokers = kafkaParams.get("metadata.broker.list")
.orElse(kafkaParams.get("bootstrap.servers"))
.getOrElse(throw new SparkException(
"Must specify metadata.broker.list or bootstrap.servers"))
val props = new Properties()
kafkaParams.foreach { case (key, value) =>
// prevent warnings on parameters ConsumerConfig doesn't know about
if (key != "metadata.broker.list" && key != "bootstrap.servers") {
props.put(key, value)
}
}
Seq("zookeeper.connect", "group.id").foreach { s =>
if (!props.containsKey(s)) {
props.setProperty(s, "")
}
}
new SimpleConsumerConfig(brokers, props)
}
}
}
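One detail worth keeping in mind from this deleted class is the broker-list format expected by SimpleConsumerConfig. A dependency-free sketch in the same spirit (slightly stricter, requiring exactly one colon per entry; the names are invented):

// Parse "host1:port1,host2:port2" into (host, port) pairs, rejecting malformed entries.
def parseBrokers(brokers: String): Array[(String, Int)] =
  brokers.split(",").map { hp =>
    hp.split(":") match {
      case Array(host, port) => (host, port.toInt)
      case _ => throw new IllegalArgumentException(
        s"Broker not in the correct format of <host>:<port> [$brokers]")
    }
  }

// parseBrokers("broker1:9092,broker2:9092") == Array(("broker1", 9092), ("broker2", 9092))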

View file

@ -1,142 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.util.Properties
import scala.collection.Map
import scala.reflect.{classTag, ClassTag}
import kafka.consumer.{Consumer, ConsumerConfig, ConsumerConnector, KafkaStream}
import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.util.ThreadUtils
/**
* Input stream that pulls messages from a Kafka Broker.
*
* @param kafkaParams Map of kafka configuration parameters.
* See: http://kafka.apache.org/configuration.html
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel RDD storage level.
*/
private[streaming]
class KafkaInputDStream[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag](
_ssc: StreamingContext,
kafkaParams: Map[String, String],
topics: Map[String, Int],
useReliableReceiver: Boolean,
storageLevel: StorageLevel
) extends ReceiverInputDStream[(K, V)](_ssc) with Logging {
def getReceiver(): Receiver[(K, V)] = {
if (!useReliableReceiver) {
new KafkaReceiver[K, V, U, T](kafkaParams, topics, storageLevel)
} else {
new ReliableKafkaReceiver[K, V, U, T](kafkaParams, topics, storageLevel)
}
}
}
private[streaming]
class KafkaReceiver[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag](
kafkaParams: Map[String, String],
topics: Map[String, Int],
storageLevel: StorageLevel
) extends Receiver[(K, V)](storageLevel) with Logging {
// Connection to Kafka
var consumerConnector: ConsumerConnector = null
def onStop() {
if (consumerConnector != null) {
consumerConnector.shutdown()
consumerConnector = null
}
}
def onStart() {
logInfo("Starting Kafka Consumer Stream with group: " + kafkaParams("group.id"))
// Kafka connection properties
val props = new Properties()
kafkaParams.foreach(param => props.put(param._1, param._2))
val zkConnect = kafkaParams("zookeeper.connect")
// Create the connection to the cluster
logInfo("Connecting to Zookeeper: " + zkConnect)
val consumerConfig = new ConsumerConfig(props)
consumerConnector = Consumer.create(consumerConfig)
logInfo("Connected to " + zkConnect)
val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(consumerConfig.props)
.asInstanceOf[Decoder[K]]
val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(consumerConfig.props)
.asInstanceOf[Decoder[V]]
// Create threads for each topic/message Stream we are listening
val topicMessageStreams = consumerConnector.createMessageStreams(
topics, keyDecoder, valueDecoder)
val executorPool =
ThreadUtils.newDaemonFixedThreadPool(topics.values.sum, "KafkaMessageHandler")
try {
// Start the messages handler for each partition
topicMessageStreams.values.foreach { streams =>
streams.foreach { stream => executorPool.submit(new MessageHandler(stream)) }
}
} finally {
executorPool.shutdown() // Just causes threads to terminate after work is done
}
}
// Handles Kafka messages
private class MessageHandler(stream: KafkaStream[K, V])
extends Runnable {
def run() {
logInfo("Starting MessageHandler.")
try {
val streamIterator = stream.iterator()
while (streamIterator.hasNext()) {
val msgAndMetadata = streamIterator.next()
store((msgAndMetadata.key, msgAndMetadata.message))
}
} catch {
case e: Throwable => reportError("Error handling message; exiting", e)
}
}
}
}

View file

@ -1,273 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import scala.collection.mutable.ArrayBuffer
import scala.reflect.{classTag, ClassTag}
import kafka.api.{FetchRequestBuilder, FetchResponse}
import kafka.common.{ErrorMapping, TopicAndPartition}
import kafka.consumer.SimpleConsumer
import kafka.message.{MessageAndMetadata, MessageAndOffset}
import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.NextIterator
/**
* A batch-oriented interface for consuming from Kafka.
* Starting and ending offsets are specified in advance,
* so that you can control exactly-once semantics.
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers" to be set
* with Kafka broker(s) specified in host1:port1,host2:port2 form.
* @param offsetRanges offset ranges that define the Kafka data belonging to this RDD
* @param messageHandler function for translating each message into the desired type
*/
private[kafka]
class KafkaRDD[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag,
R: ClassTag] private[spark] (
sc: SparkContext,
kafkaParams: Map[String, String],
val offsetRanges: Array[OffsetRange],
leaders: Map[TopicAndPartition, (String, Int)],
messageHandler: MessageAndMetadata[K, V] => R
) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {
override def getPartitions: Array[Partition] = {
offsetRanges.zipWithIndex.map { case (o, i) =>
val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)
}.toArray
}
override def count(): Long = offsetRanges.map(_.count).sum
override def countApprox(
timeout: Long,
confidence: Double = 0.95
): PartialResult[BoundedDouble] = {
val c = count
new PartialResult(new BoundedDouble(c, 1.0, c, c), true)
}
override def isEmpty(): Boolean = count == 0L
override def take(num: Int): Array[R] = {
val nonEmptyPartitions = this.partitions
.map(_.asInstanceOf[KafkaRDDPartition])
.filter(_.count > 0)
if (num < 1 || nonEmptyPartitions.isEmpty) {
return new Array[R](0)
}
// Determine in advance how many messages need to be taken from each partition
val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) =>
val remain = num - result.values.sum
if (remain > 0) {
val taken = Math.min(remain, part.count)
result + (part.index -> taken.toInt)
} else {
result
}
}
val buf = new ArrayBuffer[R]
val res = context.runJob(
this,
(tc: TaskContext, it: Iterator[R]) => it.take(parts(tc.partitionId)).toArray,
parts.keys.toArray)
res.foreach(buf ++= _)
buf.toArray
}
override def getPreferredLocations(thePart: Partition): Seq[String] = {
val part = thePart.asInstanceOf[KafkaRDDPartition]
// TODO is additional hostname resolution necessary here
Seq(part.host)
}
private def errBeginAfterEnd(part: KafkaRDDPartition): String =
s"Beginning offset ${part.fromOffset} is after the ending offset ${part.untilOffset} " +
s"for topic ${part.topic} partition ${part.partition}. " +
"You either provided an invalid fromOffset, or the Kafka topic has been damaged"
private def errRanOutBeforeEnd(part: KafkaRDDPartition): String =
s"Ran out of messages before reaching ending offset ${part.untilOffset} " +
s"for topic ${part.topic} partition ${part.partition} start ${part.fromOffset}." +
" This should not happen, and indicates that messages may have been lost"
private def errOvershotEnd(itemOffset: Long, part: KafkaRDDPartition): String =
s"Got ${itemOffset} > ending offset ${part.untilOffset} " +
s"for topic ${part.topic} partition ${part.partition} start ${part.fromOffset}." +
" This should not happen, and indicates a message may have been skipped"
override def compute(thePart: Partition, context: TaskContext): Iterator[R] = {
val part = thePart.asInstanceOf[KafkaRDDPartition]
assert(part.fromOffset <= part.untilOffset, errBeginAfterEnd(part))
if (part.fromOffset == part.untilOffset) {
logInfo(s"Beginning offset ${part.fromOffset} is the same as ending offset " +
s"skipping ${part.topic} ${part.partition}")
Iterator.empty
} else {
new KafkaRDDIterator(part, context)
}
}
/**
* An iterator that fetches messages directly from Kafka for the offsets in partition.
*/
private class KafkaRDDIterator(
part: KafkaRDDPartition,
context: TaskContext) extends NextIterator[R] {
context.addTaskCompletionListener{ context => closeIfNeeded() }
logInfo(s"Computing topic ${part.topic}, partition ${part.partition} " +
s"offsets ${part.fromOffset} -> ${part.untilOffset}")
val kc = new KafkaCluster(kafkaParams)
val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(kc.config.props)
.asInstanceOf[Decoder[K]]
val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(kc.config.props)
.asInstanceOf[Decoder[V]]
val consumer = connectLeader
var requestOffset = part.fromOffset
var iter: Iterator[MessageAndOffset] = null
// The idea is to use the provided preferred host, except on task retry attempts,
// to minimize number of kafka metadata requests
private def connectLeader: SimpleConsumer = {
if (context.attemptNumber > 0) {
kc.connectLeader(part.topic, part.partition).fold(
errs => throw new SparkException(
s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
errs.mkString("\n")),
consumer => consumer
)
} else {
kc.connect(part.host, part.port)
}
}
private def handleFetchErr(resp: FetchResponse) {
if (resp.hasError) {
val err = resp.errorCode(part.topic, part.partition)
if (err == ErrorMapping.LeaderNotAvailableCode ||
err == ErrorMapping.NotLeaderForPartitionCode) {
logError(s"Lost leader for topic ${part.topic} partition ${part.partition}, " +
s" sleeping for ${kc.config.refreshLeaderBackoffMs}ms")
Thread.sleep(kc.config.refreshLeaderBackoffMs)
}
// Let normal rdd retry sort out reconnect attempts
throw ErrorMapping.exceptionFor(err)
}
}
private def fetchBatch: Iterator[MessageAndOffset] = {
val req = new FetchRequestBuilder()
.clientId(consumer.clientId)
.addFetch(part.topic, part.partition, requestOffset, kc.config.fetchMessageMaxBytes)
.build()
val resp = consumer.fetch(req)
handleFetchErr(resp)
// kafka may return a batch that starts before the requested offset
resp.messageSet(part.topic, part.partition)
.iterator
.dropWhile(_.offset < requestOffset)
}
override def close(): Unit = {
if (consumer != null) {
consumer.close()
}
}
override def getNext(): R = {
if (iter == null || !iter.hasNext) {
iter = fetchBatch
}
if (!iter.hasNext) {
assert(requestOffset == part.untilOffset, errRanOutBeforeEnd(part))
finished = true
null.asInstanceOf[R]
} else {
val item = iter.next()
if (item.offset >= part.untilOffset) {
assert(item.offset == part.untilOffset, errOvershotEnd(item.offset, part))
finished = true
null.asInstanceOf[R]
} else {
requestOffset = item.nextOffset
messageHandler(new MessageAndMetadata(
part.topic, part.partition, item.message, item.offset, keyDecoder, valueDecoder))
}
}
}
}
}
private[kafka]
object KafkaRDD {
import KafkaCluster.LeaderOffset
/**
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>.
* Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
* NOT zookeeper servers, specified in host1:port1,host2:port2 form.
* @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive)
* starting point of the batch
* @param untilOffsets per-topic/partition Kafka offsets defining the (exclusive)
* ending point of the batch
* @param messageHandler function for translating each message into the desired type
*/
def apply[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag,
R: ClassTag](
sc: SparkContext,
kafkaParams: Map[String, String],
fromOffsets: Map[TopicAndPartition, Long],
untilOffsets: Map[TopicAndPartition, LeaderOffset],
messageHandler: MessageAndMetadata[K, V] => R
): KafkaRDD[K, V, U, T, R] = {
val leaders = untilOffsets.map { case (tp, lo) =>
tp -> ((lo.host, lo.port))
}
val offsetRanges = fromOffsets.map { case (tp, fo) =>
val uo = untilOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo.offset)
}.toArray
new KafkaRDD[K, V, U, T, R](sc, kafkaParams, offsetRanges, leaders, messageHandler)
}
}
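The batch-oriented path that survives is the Kafka 0.10 createRDD built on OffsetRange, whose count is still untilOffset - fromOffset summed across ranges. A hedged sketch, assuming a SparkContext sc and the kafkaParams map from the earlier streaming sketch are in scope; the topic and offsets are invented:

import scala.collection.JavaConverters._
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies, OffsetRange}

// Two half-open ranges: partition 0 and partition 1, offsets [0, 100) each.
val offsetRanges = Array(
  OffsetRange("example-topic", 0, 0L, 100L),
  OffsetRange("example-topic", 1, 0L, 100L))

val totalRecords = offsetRanges.map(_.count).sum  // 200, same arithmetic as the deleted count()

val rdd = KafkaUtils.createRDD[String, String](
  sc, kafkaParams.asJava, offsetRanges, LocationStrategies.PreferConsistent)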

View file

@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import org.apache.spark.Partition
/**
* @param topic kafka topic name
* @param partition kafka partition id
* @param fromOffset inclusive starting offset
* @param untilOffset exclusive ending offset
* @param host preferred kafka host, i.e. the leader at the time the rdd was created
* @param port preferred kafka host's port
*/
private[kafka]
class KafkaRDDPartition(
val index: Int,
val topic: String,
val partition: Int,
val fromOffset: Long,
val untilOffset: Long,
val host: String,
val port: Int
) extends Partition {
/** Number of messages this partition refers to */
def count(): Long = untilOffset - fromOffset
}

View file

@ -1,299 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.io.{File, IOException}
import java.lang.{Integer => JInt}
import java.net.InetSocketAddress
import java.util.{Map => JMap, Properties}
import java.util.concurrent.TimeoutException
import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.util.control.NonFatal
import kafka.admin.AdminUtils
import kafka.api.Request
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import kafka.serializer.StringEncoder
import kafka.server.{KafkaConfig, KafkaServer}
import kafka.utils.{ZKStringSerializer, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.commons.lang3.RandomUtils
import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer}
import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils
/**
* This is a helper class for Kafka test suites. This has the functionality to set up
* and tear down local Kafka servers, and to push data using Kafka producers.
*
* The reason to put Kafka test utility class in src is to test Python related Kafka APIs.
*/
private[kafka] class KafkaTestUtils extends Logging {
// Zookeeper related configurations
private val zkHost = "localhost"
private var zkPort: Int = 0
private val zkConnectionTimeout = 60000
private val zkSessionTimeout = 6000
private var zookeeper: EmbeddedZookeeper = _
private var zkClient: ZkClient = _
// Kafka broker related configurations
private val brokerHost = "localhost"
// 0.8.2 server doesn't have a boundPort method, so can't use 0 for a random port
private var brokerPort = RandomUtils.nextInt(1024, 65536)
private var brokerConf: KafkaConfig = _
// Kafka broker server
private var server: KafkaServer = _
// Kafka producer
private var producer: Producer[String, String] = _
// Flag to test whether the system is correctly started
private var zkReady = false
private var brokerReady = false
def zkAddress: String = {
assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper address")
s"$zkHost:$zkPort"
}
def brokerAddress: String = {
assert(brokerReady, "Kafka not setup yet or already torn down, cannot get broker address")
s"$brokerHost:$brokerPort"
}
def zookeeperClient: ZkClient = {
assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper client")
Option(zkClient).getOrElse(
throw new IllegalStateException("Zookeeper client is not yet initialized"))
}
// Set up the Embedded Zookeeper server and get the proper Zookeeper port
private def setupEmbeddedZookeeper(): Unit = {
// Zookeeper server startup
zookeeper = new EmbeddedZookeeper(s"$zkHost:$zkPort")
// Get the actual zookeeper binding port
zkPort = zookeeper.actualPort
zkClient = new ZkClient(s"$zkHost:$zkPort", zkSessionTimeout, zkConnectionTimeout,
ZKStringSerializer)
zkReady = true
}
// Set up the Embedded Kafka server
private def setupEmbeddedKafkaServer(): Unit = {
assert(zkReady, "Zookeeper should be set up beforehand")
// Kafka broker startup
Utils.startServiceOnPort(brokerPort, port => {
brokerPort = port
brokerConf = new KafkaConfig(brokerConfiguration)
server = new KafkaServer(brokerConf)
server.startup()
(server, brokerPort)
}, new SparkConf(), "KafkaBroker")
brokerReady = true
}
/** Set up the embedded servers, including Zookeeper and the Kafka broker */
def setup(): Unit = {
setupEmbeddedZookeeper()
setupEmbeddedKafkaServer()
}
/** Tear down the embedded servers, including the Kafka broker and Zookeeper */
def teardown(): Unit = {
brokerReady = false
zkReady = false
if (producer != null) {
producer.close()
producer = null
}
if (server != null) {
server.shutdown()
server.awaitShutdown()
server = null
}
// On Windows, `logDirs` is left open even after Kafka server above is completely shut down
// in some cases. It leads to test failures on Windows if the directory deletion failure
// throws an exception.
brokerConf.logDirs.foreach { f =>
try {
Utils.deleteRecursively(new File(f))
} catch {
case e: IOException if Utils.isWindows =>
logWarning(e.getMessage)
}
}
if (zkClient != null) {
zkClient.close()
zkClient = null
}
if (zookeeper != null) {
zookeeper.shutdown()
zookeeper = null
}
}
/** Create a Kafka topic and wait until it is propagated to the whole cluster */
def createTopic(topic: String, partitions: Int): Unit = {
AdminUtils.createTopic(zkClient, topic, partitions, 1)
// wait until metadata is propagated
(0 until partitions).foreach { p => waitUntilMetadataIsPropagated(topic, p) }
}
/** Single-argument version for backwards compatibility */
def createTopic(topic: String): Unit = createTopic(topic, 1)
/** Java-friendly function for sending messages to the Kafka broker */
def sendMessages(topic: String, messageToFreq: JMap[String, JInt]): Unit = {
sendMessages(topic, Map(messageToFreq.asScala.mapValues(_.intValue()).toSeq: _*))
}
/** Send the messages to the Kafka broker */
def sendMessages(topic: String, messageToFreq: Map[String, Int]): Unit = {
val messages = messageToFreq.flatMap { case (s, freq) => Seq.fill(freq)(s) }.toArray
sendMessages(topic, messages)
}
/** Send the array of messages to the Kafka broker */
def sendMessages(topic: String, messages: Array[String]): Unit = {
producer = new Producer[String, String](new ProducerConfig(producerConfiguration))
producer.send(messages.map { new KeyedMessage[String, String](topic, _ ) }: _*)
producer.close()
producer = null
}
private def brokerConfiguration: Properties = {
val props = new Properties()
props.put("broker.id", "0")
props.put("host.name", "localhost")
props.put("port", brokerPort.toString)
props.put("log.dir", Utils.createTempDir().getAbsolutePath)
props.put("zookeeper.connect", zkAddress)
props.put("log.flush.interval.messages", "1")
props.put("replica.socket.timeout.ms", "1500")
props
}
private def producerConfiguration: Properties = {
val props = new Properties()
props.put("metadata.broker.list", brokerAddress)
props.put("serializer.class", classOf[StringEncoder].getName)
// wait for all in-sync replicas to ack sends
props.put("request.required.acks", "-1")
props
}
// A simplified version of ScalaTest's eventually, rewritten here to avoid adding an extra test
// dependency
def eventually[T](timeout: Time, interval: Time)(func: => T): T = {
def makeAttempt(): Either[Throwable, T] = {
try {
Right(func)
} catch {
case e if NonFatal(e) => Left(e)
}
}
val startTime = System.currentTimeMillis()
@tailrec
def tryAgain(attempt: Int): T = {
makeAttempt() match {
case Right(result) => result
case Left(e) =>
val duration = System.currentTimeMillis() - startTime
if (duration < timeout.milliseconds) {
Thread.sleep(interval.milliseconds)
} else {
throw new TimeoutException(e.getMessage)
}
tryAgain(attempt + 1)
}
}
tryAgain(1)
}
private def waitUntilMetadataIsPropagated(topic: String, partition: Int): Unit = {
def isPropagated = server.apis.metadataCache.getPartitionInfo(topic, partition) match {
case Some(partitionState) =>
val leaderAndInSyncReplicas = partitionState.leaderIsrAndControllerEpoch.leaderAndIsr
ZkUtils.getLeaderForPartition(zkClient, topic, partition).isDefined &&
Request.isValidBrokerId(leaderAndInSyncReplicas.leader) &&
leaderAndInSyncReplicas.isr.size >= 1
case _ =>
false
}
eventually(Time(10000), Time(100)) {
assert(isPropagated, s"Partition [$topic, $partition] metadata not propagated after timeout")
}
}
private class EmbeddedZookeeper(val zkConnect: String) {
val snapshotDir = Utils.createTempDir()
val logDir = Utils.createTempDir()
val zookeeper = new ZooKeeperServer(snapshotDir, logDir, 500)
val (ip, port) = {
val splits = zkConnect.split(":")
(splits(0), splits(1).toInt)
}
val factory = new NIOServerCnxnFactory()
factory.configure(new InetSocketAddress(ip, port), 16)
factory.startup(zookeeper)
val actualPort = factory.getLocalPort
def shutdown() {
factory.shutdown()
// The directories are not closed even if the ZooKeeper server is shut down.
// Please see ZOOKEEPER-1844, which is fixed in 3.4.6+. It leads to test failures
// on Windows if the directory deletion failure throws an exception.
try {
Utils.deleteRecursively(snapshotDir)
} catch {
case e: IOException if Utils.isWindows =>
logWarning(e.getMessage)
}
try {
Utils.deleteRecursively(logDir)
} catch {
case e: IOException if Utils.isWindows =>
logWarning(e.getMessage)
}
}
}
}
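A hedged sketch of how a test suite typically drove this utility: set up the embedded servers, create a topic, produce messages, then tear everything down. The topic name and messages below are illustrative.

val kafkaTestUtils = new KafkaTestUtils()
kafkaTestUtils.setup()                                      // embedded Zookeeper + Kafka broker
kafkaTestUtils.createTopic("topic1", 2)                     // topic with 2 partitions
kafkaTestUtils.sendMessages("topic1", Array("a", "b", "c")) // produce test data
// ... exercise the code under test against kafkaTestUtils.brokerAddress ...
kafkaTestUtils.teardown()                                   // stop broker and Zookeeper, clean temp dirs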

View file

@ -1,806 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.io.OutputStream
import java.lang.{Integer => JInt, Long => JLong, Number => JNumber}
import java.nio.charset.StandardCharsets
import java.util.{List => JList, Locale, Map => JMap, Set => JSet}
import scala.collection.JavaConverters._
import scala.reflect.ClassTag
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.{Decoder, DefaultDecoder, StringDecoder}
import net.razorvine.pickle.{IObjectPickler, Opcodes, Pickler}
import org.apache.spark.{SparkContext, SparkException}
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.java.function.{Function => JFunction}
import org.apache.spark.api.python.SerDeUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java._
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream}
import org.apache.spark.streaming.util.WriteAheadLogUtils
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
object KafkaUtils {
/**
* Create an input stream that pulls messages from Kafka Brokers.
* @param ssc StreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel Storage level to use for storing the received objects
* (default: StorageLevel.MEMORY_AND_DISK_SER_2)
* @return DStream of (Kafka message key, Kafka message value)
*/
def createStream(
ssc: StreamingContext,
zkQuorum: String,
groupId: String,
topics: Map[String, Int],
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
): ReceiverInputDStream[(String, String)] = {
val kafkaParams = Map[String, String](
"zookeeper.connect" -> zkQuorum, "group.id" -> groupId,
"zookeeper.connection.timeout.ms" -> "10000")
createStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topics, storageLevel)
}
/**
* Create an input stream that pulls messages from Kafka Brokers.
* @param ssc StreamingContext object
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel Storage level to use for storing the received objects
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam U type of Kafka message key decoder
* @tparam T type of Kafka message value decoder
* @return DStream of (Kafka message key, Kafka message value)
*/
def createStream[K: ClassTag, V: ClassTag, U <: Decoder[_]: ClassTag, T <: Decoder[_]: ClassTag](
ssc: StreamingContext,
kafkaParams: Map[String, String],
topics: Map[String, Int],
storageLevel: StorageLevel
): ReceiverInputDStream[(K, V)] = {
val walEnabled = WriteAheadLogUtils.enableReceiverLog(ssc.conf)
new KafkaInputDStream[K, V, U, T](ssc, kafkaParams, topics, walEnabled, storageLevel)
}
/**
* Create an input stream that pulls messages from Kafka Brokers.
* Storage level of the data will be the default StorageLevel.MEMORY_AND_DISK_SER_2.
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..)
* @param groupId The group id for this consumer
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread
* @return DStream of (Kafka message key, Kafka message value)
*/
def createStream(
jssc: JavaStreamingContext,
zkQuorum: String,
groupId: String,
topics: JMap[String, JInt]
): JavaPairReceiverInputDStream[String, String] = {
createStream(jssc.ssc, zkQuorum, groupId, Map(topics.asScala.mapValues(_.intValue()).toSeq: _*))
}
/**
* Create an input stream that pulls messages from Kafka Brokers.
* @param jssc JavaStreamingContext object
* @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..).
* @param groupId The group id for this consumer.
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread.
* @param storageLevel RDD storage level.
* @return DStream of (Kafka message key, Kafka message value)
*/
def createStream(
jssc: JavaStreamingContext,
zkQuorum: String,
groupId: String,
topics: JMap[String, JInt],
storageLevel: StorageLevel
): JavaPairReceiverInputDStream[String, String] = {
createStream(jssc.ssc, zkQuorum, groupId, Map(topics.asScala.mapValues(_.intValue()).toSeq: _*),
storageLevel)
}
/**
* Create an input stream that pulls messages from Kafka Brokers.
* @param jssc JavaStreamingContext object
* @param keyTypeClass Key type of DStream
* @param valueTypeClass value type of Dstream
* @param keyDecoderClass Type of kafka key decoder
* @param valueDecoderClass Type of kafka value decoder
* @param kafkaParams Map of kafka configuration parameters,
* see http://kafka.apache.org/08/configuration.html
* @param topics Map of (topic_name to numPartitions) to consume. Each partition is consumed
* in its own thread
* @param storageLevel RDD storage level.
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam U type of Kafka message key decoder
* @tparam T type of Kafka message value decoder
* @return DStream of (Kafka message key, Kafka message value)
*/
def createStream[K, V, U <: Decoder[_], T <: Decoder[_]](
jssc: JavaStreamingContext,
keyTypeClass: Class[K],
valueTypeClass: Class[V],
keyDecoderClass: Class[U],
valueDecoderClass: Class[T],
kafkaParams: JMap[String, String],
topics: JMap[String, JInt],
storageLevel: StorageLevel
): JavaPairReceiverInputDStream[K, V] = {
implicit val keyCmt: ClassTag[K] = ClassTag(keyTypeClass)
implicit val valueCmt: ClassTag[V] = ClassTag(valueTypeClass)
implicit val keyCmd: ClassTag[U] = ClassTag(keyDecoderClass)
implicit val valueCmd: ClassTag[T] = ClassTag(valueDecoderClass)
createStream[K, V, U, T](
jssc.ssc,
kafkaParams.asScala.toMap,
Map(topics.asScala.mapValues(_.intValue()).toSeq: _*),
storageLevel)
}
/** Get leaders for the given offset ranges, or throw an exception */
private def leadersForRanges(
kc: KafkaCluster,
offsetRanges: Array[OffsetRange]): Map[TopicAndPartition, (String, Int)] = {
val topics = offsetRanges.map(o => TopicAndPartition(o.topic, o.partition)).toSet
val leaders = kc.findLeaders(topics)
KafkaCluster.checkErrors(leaders)
}
/** Make sure offsets are available in kafka, or throw an exception */
private def checkOffsets(
kc: KafkaCluster,
offsetRanges: Array[OffsetRange]): Unit = {
val topics = offsetRanges.map(_.topicAndPartition).toSet
val result = for {
low <- kc.getEarliestLeaderOffsets(topics).right
high <- kc.getLatestLeaderOffsets(topics).right
} yield {
offsetRanges.filterNot { o =>
low(o.topicAndPartition).offset <= o.fromOffset &&
o.untilOffset <= high(o.topicAndPartition).offset
}
}
val badRanges = KafkaCluster.checkErrors(result)
if (!badRanges.isEmpty) {
throw new SparkException("Offsets not available on leader: " + badRanges.mkString(","))
}
}
private[kafka] def getFromOffsets(
kc: KafkaCluster,
kafkaParams: Map[String, String],
topics: Set[String]
): Map[TopicAndPartition, Long] = {
val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase(Locale.ROOT))
val result = for {
topicPartitions <- kc.getPartitions(topics).right
leaderOffsets <- (if (reset == Some("smallest")) {
kc.getEarliestLeaderOffsets(topicPartitions)
} else {
kc.getLatestLeaderOffsets(topicPartitions)
}).right
} yield {
leaderOffsets.map { case (tp, lo) =>
(tp, lo.offset)
}
}
KafkaCluster.checkErrors(result)
}
/**
* Create an RDD from Kafka using offset ranges for each topic and partition.
*
* @param sc SparkContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers) specified in
* host1:port1,host2:port2 form.
* @param offsetRanges Each OffsetRange in the batch corresponds to a
* range of offsets for a given Kafka topic/partition
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @return RDD of (Kafka message key, Kafka message value)
*/
def createRDD[
K: ClassTag,
V: ClassTag,
KD <: Decoder[K]: ClassTag,
VD <: Decoder[V]: ClassTag](
sc: SparkContext,
kafkaParams: Map[String, String],
offsetRanges: Array[OffsetRange]
): RDD[(K, V)] = sc.withScope {
val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
val kc = new KafkaCluster(kafkaParams)
val leaders = leadersForRanges(kc, offsetRanges)
checkOffsets(kc, offsetRanges)
new KafkaRDD[K, V, KD, VD, (K, V)](sc, kafkaParams, offsetRanges, leaders, messageHandler)
}
/**
* Create an RDD from Kafka using offset ranges for each topic and partition. This allows you to
* specify the Kafka leader to connect to (to optimize fetching) and access the message as well
* as the metadata.
*
* @param sc SparkContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers) specified in
* host1:port1,host2:port2 form.
* @param offsetRanges Each OffsetRange in the batch corresponds to a
* range of offsets for a given Kafka topic/partition
* @param leaders Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty map,
* in which case leaders will be looked up on the driver.
* @param messageHandler Function for translating each message and metadata into the desired type
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @tparam R type returned by messageHandler
* @return RDD of R
*/
def createRDD[
K: ClassTag,
V: ClassTag,
KD <: Decoder[K]: ClassTag,
VD <: Decoder[V]: ClassTag,
R: ClassTag](
sc: SparkContext,
kafkaParams: Map[String, String],
offsetRanges: Array[OffsetRange],
leaders: Map[TopicAndPartition, Broker],
messageHandler: MessageAndMetadata[K, V] => R
): RDD[R] = sc.withScope {
val kc = new KafkaCluster(kafkaParams)
val leaderMap = if (leaders.isEmpty) {
leadersForRanges(kc, offsetRanges)
} else {
// This could be avoided by refactoring KafkaRDD.leaders and KafkaCluster to use Broker
leaders.map {
case (tp: TopicAndPartition, Broker(host, port)) => (tp, (host, port))
}
}
val cleanedHandler = sc.clean(messageHandler)
checkOffsets(kc, offsetRanges)
new KafkaRDD[K, V, KD, VD, R](sc, kafkaParams, offsetRanges, leaderMap, cleanedHandler)
}
/**
* Create an RDD from Kafka using offset ranges for each topic and partition.
*
* @param jsc JavaSparkContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers) specified in
* host1:port1,host2:port2 form.
* @param offsetRanges Each OffsetRange in the batch corresponds to a
* range of offsets for a given Kafka topic/partition
* @param keyClass type of Kafka message key
* @param valueClass type of Kafka message value
* @param keyDecoderClass type of Kafka message key decoder
* @param valueDecoderClass type of Kafka message value decoder
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @return RDD of (Kafka message key, Kafka message value)
*/
def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V]](
jsc: JavaSparkContext,
keyClass: Class[K],
valueClass: Class[V],
keyDecoderClass: Class[KD],
valueDecoderClass: Class[VD],
kafkaParams: JMap[String, String],
offsetRanges: Array[OffsetRange]
): JavaPairRDD[K, V] = jsc.sc.withScope {
implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass)
new JavaPairRDD(createRDD[K, V, KD, VD](
jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges))
}
/**
* Create an RDD from Kafka using offset ranges for each topic and partition. This allows you to
* specify the Kafka leader to connect to (to optimize fetching) and access the message as well
* as the metadata.
*
* @param jsc JavaSparkContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers) specified in
* host1:port1,host2:port2 form.
* @param offsetRanges Each OffsetRange in the batch corresponds to a
* range of offsets for a given Kafka topic/partition
* @param leaders Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty map,
* in which case leaders will be looked up on the driver.
* @param messageHandler Function for translating each message and metadata into the desired type
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @tparam R type returned by messageHandler
* @return RDD of R
*/
def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V], R](
jsc: JavaSparkContext,
keyClass: Class[K],
valueClass: Class[V],
keyDecoderClass: Class[KD],
valueDecoderClass: Class[VD],
recordClass: Class[R],
kafkaParams: JMap[String, String],
offsetRanges: Array[OffsetRange],
leaders: JMap[TopicAndPartition, Broker],
messageHandler: JFunction[MessageAndMetadata[K, V], R]
): JavaRDD[R] = jsc.sc.withScope {
implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass)
implicit val recordCmt: ClassTag[R] = ClassTag(recordClass)
val leaderMap = Map(leaders.asScala.toSeq: _*)
createRDD[K, V, KD, VD, R](
jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges, leaderMap, messageHandler.call(_))
}
/**
* Create an input stream that directly pulls messages from Kafka Brokers
* without using any receiver. This stream can guarantee that each message
* from Kafka is included in transformations exactly once (see points below).
*
* Points to note:
* - No receivers: This stream does not use any receiver. It directly queries Kafka
* - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
* by the stream itself. For interoperability with Kafka monitoring tools that depend on
* Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
* in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every record is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
* outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure
* that the output operation is idempotent, or use transactions to output records atomically.
* See the programming guide for more details.
*
* @param ssc StreamingContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers) specified in
* host1:port1,host2:port2 form.
* @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive)
* starting point of the stream
* @param messageHandler Function for translating each message and metadata into the desired type
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @tparam R type returned by messageHandler
* @return DStream of R
*/
def createDirectStream[
K: ClassTag,
V: ClassTag,
KD <: Decoder[K]: ClassTag,
VD <: Decoder[V]: ClassTag,
R: ClassTag] (
ssc: StreamingContext,
kafkaParams: Map[String, String],
fromOffsets: Map[TopicAndPartition, Long],
messageHandler: MessageAndMetadata[K, V] => R
): InputDStream[R] = {
val cleanedHandler = ssc.sc.clean(messageHandler)
new DirectKafkaInputDStream[K, V, KD, VD, R](
ssc, kafkaParams, fromOffsets, cleanedHandler)
}
/**
* Create an input stream that directly pulls messages from Kafka Brokers
* without using any receiver. This stream can guarantee that each message
* from Kafka is included in transformations exactly once (see points below).
*
* Points to note:
* - No receivers: This stream does not use any receiver. It directly queries Kafka
* - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
* by the stream itself. For interoperability with Kafka monitoring tools that depend on
* Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
* in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every record is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
* outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure
* that the output operation is idempotent, or use transactions to output records atomically.
* See the programming guide for more details.
*
* @param ssc StreamingContext object
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers), specified in
* host1:port1,host2:port2 form.
* If not starting from a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest"
* to determine where the stream starts (defaults to "largest")
* @param topics Names of the topics to consume
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @return DStream of (Kafka message key, Kafka message value)
*/
def createDirectStream[
K: ClassTag,
V: ClassTag,
KD <: Decoder[K]: ClassTag,
VD <: Decoder[V]: ClassTag] (
ssc: StreamingContext,
kafkaParams: Map[String, String],
topics: Set[String]
): InputDStream[(K, V)] = {
val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
val kc = new KafkaCluster(kafkaParams)
val fromOffsets = getFromOffsets(kc, kafkaParams, topics)
new DirectKafkaInputDStream[K, V, KD, VD, (K, V)](
ssc, kafkaParams, fromOffsets, messageHandler)
}
/**
* Create an input stream that directly pulls messages from Kafka Brokers
* without using any receiver. This stream can guarantee that each message
* from Kafka is included in transformations exactly once (see points below).
*
* Points to note:
* - No receivers: This stream does not use any receiver. It directly queries Kafka
* - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
* by the stream itself. For interoperability with Kafka monitoring tools that depend on
* Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
* in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every record is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
* outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure
* that the output operation is idempotent, or use transactions to output records atomically.
* See the programming guide for more details.
*
* @param jssc JavaStreamingContext object
* @param keyClass Class of the keys in the Kafka records
* @param valueClass Class of the values in the Kafka records
* @param keyDecoderClass Class of the key decoder
* @param valueDecoderClass Class of the value decoder
* @param recordClass Class of the records in DStream
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers), specified in
* host1:port1,host2:port2 form.
* @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive)
* starting point of the stream
* @param messageHandler Function for translating each message and metadata into the desired type
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @tparam R type returned by messageHandler
* @return DStream of R
*/
def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R](
jssc: JavaStreamingContext,
keyClass: Class[K],
valueClass: Class[V],
keyDecoderClass: Class[KD],
valueDecoderClass: Class[VD],
recordClass: Class[R],
kafkaParams: JMap[String, String],
fromOffsets: JMap[TopicAndPartition, JLong],
messageHandler: JFunction[MessageAndMetadata[K, V], R]
): JavaInputDStream[R] = {
implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass)
implicit val recordCmt: ClassTag[R] = ClassTag(recordClass)
val cleanedHandler = jssc.sparkContext.clean(messageHandler.call _)
createDirectStream[K, V, KD, VD, R](
jssc.ssc,
Map(kafkaParams.asScala.toSeq: _*),
Map(fromOffsets.asScala.mapValues(_.longValue()).toSeq: _*),
cleanedHandler
)
}
/**
* Create an input stream that directly pulls messages from Kafka Brokers
* without using any receiver. This stream can guarantee that each message
* from Kafka is included in transformations exactly once (see points below).
*
* Points to note:
* - No receivers: This stream does not use any receiver. It directly queries Kafka
* - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked
* by the stream itself. For interoperability with Kafka monitoring tools that depend on
* Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
* You can access the offsets used in each batch from the generated RDDs (see
* [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
* - Failure Recovery: To recover from driver failures, you have to enable checkpointing
* in the `StreamingContext`. The information on consumed offset can be
* recovered from the checkpoint. See the programming guide for details (constraints, etc.).
* - End-to-end semantics: This stream ensures that every record is effectively received and
* transformed exactly once, but gives no guarantees on whether the transformed data are
* outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure
* that the output operation is idempotent, or use transactions to output records atomically.
* See the programming guide for more details.
*
* @param jssc JavaStreamingContext object
* @param keyClass Class of the keys in the Kafka records
* @param valueClass Class of the values in the Kafka records
* @param keyDecoderClass Class of the key decoder
* @param valueDecoderClass Class type of the value decoder
* @param kafkaParams Kafka <a href="http://kafka.apache.org/documentation.html#configuration">
* configuration parameters</a>. Requires "metadata.broker.list" or "bootstrap.servers"
* to be set with Kafka broker(s) (NOT zookeeper servers), specified in
* host1:port1,host2:port2 form.
* If not starting from a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest"
* to determine where the stream starts (defaults to "largest")
* @param topics Names of the topics to consume
* @tparam K type of Kafka message key
* @tparam V type of Kafka message value
* @tparam KD type of Kafka message key decoder
* @tparam VD type of Kafka message value decoder
* @return DStream of (Kafka message key, Kafka message value)
*/
def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]](
jssc: JavaStreamingContext,
keyClass: Class[K],
valueClass: Class[V],
keyDecoderClass: Class[KD],
valueDecoderClass: Class[VD],
kafkaParams: JMap[String, String],
topics: JSet[String]
): JavaPairInputDStream[K, V] = {
implicit val keyCmt: ClassTag[K] = ClassTag(keyClass)
implicit val valueCmt: ClassTag[V] = ClassTag(valueClass)
implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass)
implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass)
createDirectStream[K, V, KD, VD](
jssc.ssc,
Map(kafkaParams.asScala.toSeq: _*),
Set(topics.asScala.toSeq: _*)
)
}
}
/**
* This is a helper class that wraps KafkaUtils.createStream() in a more
* Python-friendly class and function so that it can be easily
* instantiated and called from Python's KafkaUtils.
*
* The zero-arg constructor helps instantiate this class from the Class object via
* classOf[KafkaUtilsPythonHelper].newInstance(), and createStream()
* takes care of the known parameters instead of passing them from Python
*/
private[kafka] class KafkaUtilsPythonHelper {
import KafkaUtilsPythonHelper._
def createStream(
jssc: JavaStreamingContext,
kafkaParams: JMap[String, String],
topics: JMap[String, JInt],
storageLevel: StorageLevel): JavaPairReceiverInputDStream[Array[Byte], Array[Byte]] = {
KafkaUtils.createStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
jssc,
classOf[Array[Byte]],
classOf[Array[Byte]],
classOf[DefaultDecoder],
classOf[DefaultDecoder],
kafkaParams,
topics,
storageLevel)
}
def createRDDWithoutMessageHandler(
jsc: JavaSparkContext,
kafkaParams: JMap[String, String],
offsetRanges: JList[OffsetRange],
leaders: JMap[TopicAndPartition, Broker]): JavaRDD[(Array[Byte], Array[Byte])] = {
val messageHandler =
(mmd: MessageAndMetadata[Array[Byte], Array[Byte]]) => (mmd.key, mmd.message)
new JavaRDD(createRDD(jsc, kafkaParams, offsetRanges, leaders, messageHandler))
}
def createRDDWithMessageHandler(
jsc: JavaSparkContext,
kafkaParams: JMap[String, String],
offsetRanges: JList[OffsetRange],
leaders: JMap[TopicAndPartition, Broker]): JavaRDD[Array[Byte]] = {
val messageHandler = (mmd: MessageAndMetadata[Array[Byte], Array[Byte]]) =>
new PythonMessageAndMetadata(
mmd.topic, mmd.partition, mmd.offset, mmd.key(), mmd.message())
val rdd = createRDD(jsc, kafkaParams, offsetRanges, leaders, messageHandler).
mapPartitions(picklerIterator)
new JavaRDD(rdd)
}
private def createRDD[V: ClassTag](
jsc: JavaSparkContext,
kafkaParams: JMap[String, String],
offsetRanges: JList[OffsetRange],
leaders: JMap[TopicAndPartition, Broker],
messageHandler: MessageAndMetadata[Array[Byte], Array[Byte]] => V): RDD[V] = {
KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, V](
jsc.sc,
kafkaParams.asScala.toMap,
offsetRanges.toArray(new Array[OffsetRange](offsetRanges.size())),
leaders.asScala.toMap,
messageHandler
)
}
def createDirectStreamWithoutMessageHandler(
jssc: JavaStreamingContext,
kafkaParams: JMap[String, String],
topics: JSet[String],
fromOffsets: JMap[TopicAndPartition, JNumber]): JavaDStream[(Array[Byte], Array[Byte])] = {
val messageHandler =
(mmd: MessageAndMetadata[Array[Byte], Array[Byte]]) => (mmd.key, mmd.message)
new JavaDStream(createDirectStream(jssc, kafkaParams, topics, fromOffsets, messageHandler))
}
def createDirectStreamWithMessageHandler(
jssc: JavaStreamingContext,
kafkaParams: JMap[String, String],
topics: JSet[String],
fromOffsets: JMap[TopicAndPartition, JNumber]): JavaDStream[Array[Byte]] = {
val messageHandler = (mmd: MessageAndMetadata[Array[Byte], Array[Byte]]) =>
new PythonMessageAndMetadata(mmd.topic, mmd.partition, mmd.offset, mmd.key(), mmd.message())
val stream = createDirectStream(jssc, kafkaParams, topics, fromOffsets, messageHandler).
mapPartitions(picklerIterator)
new JavaDStream(stream)
}
private def createDirectStream[V: ClassTag](
jssc: JavaStreamingContext,
kafkaParams: JMap[String, String],
topics: JSet[String],
fromOffsets: JMap[TopicAndPartition, JNumber],
messageHandler: MessageAndMetadata[Array[Byte], Array[Byte]] => V): DStream[V] = {
val currentFromOffsets = if (!fromOffsets.isEmpty) {
val topicsFromOffsets = fromOffsets.keySet().asScala.map(_.topic)
if (topicsFromOffsets != topics.asScala.toSet) {
throw new IllegalStateException(
s"The specified topics: ${topics.asScala.toSet.mkString(" ")} " +
s"do not equal to the topic from offsets: ${topicsFromOffsets.mkString(" ")}")
}
Map(fromOffsets.asScala.mapValues { _.longValue() }.toSeq: _*)
} else {
val kc = new KafkaCluster(Map(kafkaParams.asScala.toSeq: _*))
KafkaUtils.getFromOffsets(
kc, Map(kafkaParams.asScala.toSeq: _*), Set(topics.asScala.toSeq: _*))
}
KafkaUtils.createDirectStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, V](
jssc.ssc,
Map(kafkaParams.asScala.toSeq: _*),
Map(currentFromOffsets.toSeq: _*),
messageHandler)
}
def createOffsetRange(topic: String, partition: JInt, fromOffset: JLong, untilOffset: JLong
): OffsetRange = OffsetRange.create(topic, partition, fromOffset, untilOffset)
def createTopicAndPartition(topic: String, partition: JInt): TopicAndPartition =
TopicAndPartition(topic, partition)
def createBroker(host: String, port: JInt): Broker = Broker(host, port)
def offsetRangesOfKafkaRDD(rdd: RDD[_]): JList[OffsetRange] = {
val parentRDDs = rdd.getNarrowAncestors
val kafkaRDDs = parentRDDs.filter(rdd => rdd.isInstanceOf[KafkaRDD[_, _, _, _, _]])
require(
kafkaRDDs.length == 1,
"Cannot get offset ranges, as there may be multiple Kafka RDDs or no Kafka RDD associated" +
"with this RDD, please call this method only on a Kafka RDD.")
val kafkaRDD = kafkaRDDs.head.asInstanceOf[KafkaRDD[_, _, _, _, _]]
kafkaRDD.offsetRanges.toSeq.asJava
}
}
private object KafkaUtilsPythonHelper {
private var initialized = false
def initialize(): Unit = {
SerDeUtil.initialize()
synchronized {
if (!initialized) {
new PythonMessageAndMetadataPickler().register()
initialized = true
}
}
}
initialize()
def picklerIterator(iter: Iterator[Any]): Iterator[Array[Byte]] = {
new SerDeUtil.AutoBatchedPickler(iter)
}
case class PythonMessageAndMetadata(
topic: String,
partition: JInt,
offset: JLong,
key: Array[Byte],
message: Array[Byte])
class PythonMessageAndMetadataPickler extends IObjectPickler {
private val module = "pyspark.streaming.kafka"
def register(): Unit = {
Pickler.registerCustomPickler(classOf[PythonMessageAndMetadata], this)
Pickler.registerCustomPickler(this.getClass, this)
}
def pickle(obj: Object, out: OutputStream, pickler: Pickler) {
if (obj == this) {
out.write(Opcodes.GLOBAL)
out.write(s"$module\nKafkaMessageAndMetadata\n".getBytes(StandardCharsets.UTF_8))
} else {
pickler.save(this)
val msgAndMetaData = obj.asInstanceOf[PythonMessageAndMetadata]
out.write(Opcodes.MARK)
pickler.save(msgAndMetaData.topic)
pickler.save(msgAndMetaData.partition)
pickler.save(msgAndMetaData.offset)
pickler.save(msgAndMetaData.key)
pickler.save(msgAndMetaData.message)
out.write(Opcodes.TUPLE)
out.write(Opcodes.REDUCE)
}
}
}
}
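For reference, a minimal sketch of the direct-stream API documented above, assuming an application-defined SparkConf; the broker addresses and topic name are illustrative.

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}

val sparkConf = new SparkConf().setAppName("DirectKafkaExample")
val ssc = new StreamingContext(sparkConf, Seconds(2))
val kafkaParams = Map(
  "metadata.broker.list" -> "host1:9092,host2:9092",  // Kafka brokers, not Zookeeper
  "auto.offset.reset" -> "smallest")
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc, kafkaParams, Set("topic1"))
stream.foreachRDD { rdd =>
  // Offsets consumed by each batch are exposed through HasOffsetRanges.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  offsetRanges.foreach { o =>
    println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
  }
}
ssc.start()
ssc.awaitTermination()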

View file

@ -1,112 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import kafka.common.TopicAndPartition
/**
* Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the
* offset ranges in RDDs generated by the direct Kafka DStream (see
* `KafkaUtils.createDirectStream()`).
* {{{
* KafkaUtils.createDirectStream(...).foreachRDD { rdd =>
* val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
* ...
* }
* }}}
*/
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
trait HasOffsetRanges {
def offsetRanges: Array[OffsetRange]
}
/**
* Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class
* can be created with `OffsetRange.create()`.
* @param topic Kafka topic name
* @param partition Kafka partition id
* @param fromOffset Inclusive starting offset
* @param untilOffset Exclusive ending offset
*/
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
final class OffsetRange private(
val topic: String,
val partition: Int,
val fromOffset: Long,
val untilOffset: Long) extends Serializable {
import OffsetRange.OffsetRangeTuple
/** Kafka TopicAndPartition object, for convenience */
def topicAndPartition(): TopicAndPartition = TopicAndPartition(topic, partition)
/** Number of messages this OffsetRange refers to */
def count(): Long = untilOffset - fromOffset
override def equals(obj: Any): Boolean = obj match {
case that: OffsetRange =>
this.topic == that.topic &&
this.partition == that.partition &&
this.fromOffset == that.fromOffset &&
this.untilOffset == that.untilOffset
case _ => false
}
override def hashCode(): Int = {
toTuple.hashCode()
}
override def toString(): String = {
s"OffsetRange(topic: '$topic', partition: $partition, range: [$fromOffset -> $untilOffset])"
}
/** this is to avoid ClassNotFoundException during checkpoint restore */
private[streaming]
def toTuple: OffsetRangeTuple = (topic, partition, fromOffset, untilOffset)
}
/**
* Companion object that provides methods to create instances of [[OffsetRange]].
*/
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
object OffsetRange {
def create(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange =
new OffsetRange(topic, partition, fromOffset, untilOffset)
def create(
topicAndPartition: TopicAndPartition,
fromOffset: Long,
untilOffset: Long): OffsetRange =
new OffsetRange(topicAndPartition.topic, topicAndPartition.partition, fromOffset, untilOffset)
def apply(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange =
new OffsetRange(topic, partition, fromOffset, untilOffset)
def apply(
topicAndPartition: TopicAndPartition,
fromOffset: Long,
untilOffset: Long): OffsetRange =
new OffsetRange(topicAndPartition.topic, topicAndPartition.partition, fromOffset, untilOffset)
/** this is to avoid ClassNotFoundException during checkpoint restore */
private[kafka]
type OffsetRangeTuple = (String, Int, Long, Long)
private[kafka]
def apply(t: OffsetRangeTuple) =
new OffsetRange(t._1, t._2, t._3, t._4)
}
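A brief sketch of creating a batch RDD from explicit offset ranges using the factory methods above; sc is assumed to be an existing SparkContext, and the topic, broker, and offsets are illustrative.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

val offsetRanges = Array(
  OffsetRange("topic1", partition = 0, fromOffset = 0L, untilOffset = 100L),
  OffsetRange("topic1", partition = 1, fromOffset = 0L, untilOffset = 100L))
val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
  sc, Map("metadata.broker.list" -> "host1:9092"), offsetRanges)
// rdd contains exactly offsetRanges.map(_.count()).sum messages.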

View file

@ -1,302 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.util.Properties
import java.util.concurrent.{ConcurrentHashMap, ThreadPoolExecutor}
import scala.collection.{mutable, Map}
import scala.reflect.{classTag, ClassTag}
import kafka.common.TopicAndPartition
import kafka.consumer.{Consumer, ConsumerConfig, ConsumerConnector, KafkaStream}
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import kafka.utils.{VerifiableProperties, ZKGroupTopicDirs, ZKStringSerializer, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.storage.{StorageLevel, StreamBlockId}
import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver}
import org.apache.spark.util.ThreadUtils
/**
* ReliableKafkaReceiver offers the ability to reliably store data into BlockManager without loss.
* It is turned off by default and will be enabled when
* spark.streaming.receiver.writeAheadLog.enable is true. The difference compared to KafkaReceiver
* is that this receiver manages topic-partition/offset itself and updates the offset information
* after data is reliably stored as write-ahead log. Offsets will only be updated when data is
* reliably stored, so the potential data loss problem of KafkaReceiver can be eliminated.
*
* Note: ReliableKafkaReceiver will set auto.commit.enable to false to turn off the automatic offset
* commit mechanism in the Kafka consumer, so setting this configuration manually within kafkaParams
* will not take effect.
*/
private[streaming]
class ReliableKafkaReceiver[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag](
kafkaParams: Map[String, String],
topics: Map[String, Int],
storageLevel: StorageLevel)
extends Receiver[(K, V)](storageLevel) with Logging {
private val groupId = kafkaParams("group.id")
private val AUTO_OFFSET_COMMIT = "auto.commit.enable"
private def conf = SparkEnv.get.conf
/** High level consumer to connect to Kafka. */
private var consumerConnector: ConsumerConnector = null
/** zkClient to connect to Zookeeper to commit the offsets. */
private var zkClient: ZkClient = null
/**
* A HashMap to manage the offset for each topic/partition. This HashMap is only accessed in
* synchronized blocks, so the mutable HashMap will not run into concurrency issues.
*/
private var topicPartitionOffsetMap: mutable.HashMap[TopicAndPartition, Long] = null
/** A concurrent HashMap to store the stream block id and related offset snapshot. */
private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]] = null
/**
* Manage the BlockGenerator in receiver itself for better managing block store and offset
* commit.
*/
private var blockGenerator: BlockGenerator = null
/** Thread pool running the handlers for receiving messages from multiple topics and partitions. */
private var messageHandlerThreadPool: ThreadPoolExecutor = null
override def onStart(): Unit = {
logInfo(s"Starting Kafka Consumer Stream with group: $groupId")
// Initialize the topic-partition / offset hash map.
topicPartitionOffsetMap = new mutable.HashMap[TopicAndPartition, Long]
// Initialize the stream block id / offset snapshot hash map.
blockOffsetMap = new ConcurrentHashMap[StreamBlockId, Map[TopicAndPartition, Long]]()
// Initialize the block generator for storing Kafka message.
blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler)
if (kafkaParams.contains(AUTO_OFFSET_COMMIT) && kafkaParams(AUTO_OFFSET_COMMIT) == "true") {
logWarning(s"$AUTO_OFFSET_COMMIT should be set to false in ReliableKafkaReceiver, " +
"otherwise we will manually set it to false to turn off auto offset commit in Kafka")
}
val props = new Properties()
kafkaParams.foreach(param => props.put(param._1, param._2))
// Manually set "auto.commit.enable" to "false" no matter user explicitly set it to true,
// we have to make sure this property is set to false to turn off auto commit mechanism in
// Kafka.
props.setProperty(AUTO_OFFSET_COMMIT, "false")
val consumerConfig = new ConsumerConfig(props)
assert(!consumerConfig.autoCommitEnable)
logInfo(s"Connecting to Zookeeper: ${consumerConfig.zkConnect}")
consumerConnector = Consumer.create(consumerConfig)
logInfo(s"Connected to Zookeeper: ${consumerConfig.zkConnect}")
zkClient = new ZkClient(consumerConfig.zkConnect, consumerConfig.zkSessionTimeoutMs,
consumerConfig.zkConnectionTimeoutMs, ZKStringSerializer)
messageHandlerThreadPool = ThreadUtils.newDaemonFixedThreadPool(
topics.values.sum, "KafkaMessageHandler")
blockGenerator.start()
val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(consumerConfig.props)
.asInstanceOf[Decoder[K]]
val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
.newInstance(consumerConfig.props)
.asInstanceOf[Decoder[V]]
val topicMessageStreams = consumerConnector.createMessageStreams(
topics, keyDecoder, valueDecoder)
topicMessageStreams.values.foreach { streams =>
streams.foreach { stream =>
messageHandlerThreadPool.submit(new MessageHandler(stream))
}
}
}
override def onStop(): Unit = {
if (messageHandlerThreadPool != null) {
messageHandlerThreadPool.shutdown()
messageHandlerThreadPool = null
}
if (consumerConnector != null) {
consumerConnector.shutdown()
consumerConnector = null
}
if (zkClient != null) {
zkClient.close()
zkClient = null
}
if (blockGenerator != null) {
blockGenerator.stop()
blockGenerator = null
}
if (topicPartitionOffsetMap != null) {
topicPartitionOffsetMap.clear()
topicPartitionOffsetMap = null
}
if (blockOffsetMap != null) {
blockOffsetMap.clear()
blockOffsetMap = null
}
}
/** Store a Kafka message and the associated metadata as a tuple. */
private def storeMessageAndMetadata(
msgAndMetadata: MessageAndMetadata[K, V]): Unit = {
val topicAndPartition = TopicAndPartition(msgAndMetadata.topic, msgAndMetadata.partition)
val data = (msgAndMetadata.key, msgAndMetadata.message)
val metadata = (topicAndPartition, msgAndMetadata.offset)
blockGenerator.addDataWithCallback(data, metadata)
}
/** Update stored offset */
private def updateOffset(topicAndPartition: TopicAndPartition, offset: Long): Unit = {
topicPartitionOffsetMap.put(topicAndPartition, offset)
}
/**
* Remember the current offsets for each topic and partition. This is called when a block is
* generated.
*/
private def rememberBlockOffsets(blockId: StreamBlockId): Unit = {
// Get a snapshot of current offset map and store with related block id.
val offsetSnapshot = topicPartitionOffsetMap.toMap
blockOffsetMap.put(blockId, offsetSnapshot)
topicPartitionOffsetMap.clear()
}
/**
* Store the ready-to-be-stored block and commit the related offsets to zookeeper. This method
* will try a fixed number of times to push the block. If the push fails, the receiver is stopped.
*/
private def storeBlockAndCommitOffset(
blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = {
var count = 0
var pushed = false
var exception: Exception = null
while (!pushed && count <= 3) {
try {
store(arrayBuffer.asInstanceOf[mutable.ArrayBuffer[(K, V)]])
pushed = true
} catch {
case ex: Exception =>
count += 1
exception = ex
}
}
if (pushed) {
Option(blockOffsetMap.get(blockId)).foreach(commitOffset)
blockOffsetMap.remove(blockId)
} else {
stop("Error while storing block into Spark", exception)
}
}
/**
* Commit the offsets of Kafka topics/partitions. The commit mechanism follows Kafka 0.8.x's
* metadata schema in Zookeeper.
*/
private def commitOffset(offsetMap: Map[TopicAndPartition, Long]): Unit = {
if (zkClient == null) {
val thrown = new IllegalStateException("Zookeeper client is unexpectedly null")
stop("Zookeeper client is not initialized before commit offsets to ZK", thrown)
return
}
for ((topicAndPart, offset) <- offsetMap) {
try {
val topicDirs = new ZKGroupTopicDirs(groupId, topicAndPart.topic)
val zkPath = s"${topicDirs.consumerOffsetDir}/${topicAndPart.partition}"
ZkUtils.updatePersistentPath(zkClient, zkPath, offset.toString)
} catch {
case e: Exception =>
logWarning(s"Exception during commit offset $offset for topic" +
s"${topicAndPart.topic}, partition ${topicAndPart.partition}", e)
}
logInfo(s"Committed offset $offset for topic ${topicAndPart.topic}, " +
s"partition ${topicAndPart.partition}")
}
}
/** Class to handle received Kafka message. */
private final class MessageHandler(stream: KafkaStream[K, V]) extends Runnable {
override def run(): Unit = {
while (!isStopped) {
try {
val streamIterator = stream.iterator()
while (streamIterator.hasNext) {
storeMessageAndMetadata(streamIterator.next)
}
} catch {
case e: Exception =>
reportError("Error handling message", e)
}
}
}
}
/** Class to handle blocks generated by the block generator. */
private final class GeneratedBlockHandler extends BlockGeneratorListener {
def onAddData(data: Any, metadata: Any): Unit = {
// Update the offset of the data that was added to the generator
if (metadata != null) {
val (topicAndPartition, offset) = metadata.asInstanceOf[(TopicAndPartition, Long)]
updateOffset(topicAndPartition, offset)
}
}
def onGenerateBlock(blockId: StreamBlockId): Unit = {
// Remember the offsets of topics/partitions when a block has been generated
rememberBlockOffsets(blockId)
}
def onPushBlock(blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = {
// Store block and commit the blocks offset
storeBlockAndCommitOffset(blockId, arrayBuffer)
}
def onError(message: String, throwable: Throwable): Unit = {
reportError(message, throwable)
}
}
}
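For context, a hedged sketch of the configuration that selects this receiver: KafkaUtils.createStream uses the reliable path when the receiver write-ahead log is enabled, and the WAL in turn needs a checkpoint directory. The application name, Zookeeper quorum, group id, and paths are illustrative.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

val conf = new SparkConf()
  .setAppName("ReliableKafkaExample")
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(2))
ssc.checkpoint("/path/to/checkpoint")   // the write-ahead log requires a checkpoint directory
// With the WAL enabled, blocks are stored durably before offsets are committed to Zookeeper.
val stream = KafkaUtils.createStream(ssc, "zkhost:2181", "consumer-group", Map("topic1" -> 1))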

View file

@ -1,21 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Kafka receiver for spark streaming.
*/
package org.apache.spark.streaming.kafka;

View file

@ -1,23 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming
/**
* Kafka receiver for spark streaming.
*/
package object kafka

View file

@ -1,170 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka;
import java.io.Serializable;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import scala.Tuple2;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
public class JavaDirectKafkaStreamSuite implements Serializable {
private transient JavaStreamingContext ssc = null;
private transient KafkaTestUtils kafkaTestUtils = null;
@Before
public void setUp() {
kafkaTestUtils = new KafkaTestUtils();
kafkaTestUtils.setup();
SparkConf sparkConf = new SparkConf()
.setMaster("local[4]").setAppName(this.getClass().getSimpleName());
ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(200));
}
@After
public void tearDown() {
if (ssc != null) {
ssc.stop();
ssc = null;
}
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown();
kafkaTestUtils = null;
}
}
@Test
public void testKafkaStream() throws InterruptedException {
final String topic1 = "topic1";
final String topic2 = "topic2";
// hold a reference to the current offset ranges, so it can be used downstream
final AtomicReference<OffsetRange[]> offsetRanges = new AtomicReference<>();
String[] topic1data = createTopicAndSendData(topic1);
String[] topic2data = createTopicAndSendData(topic2);
Set<String> sent = new HashSet<>();
sent.addAll(Arrays.asList(topic1data));
sent.addAll(Arrays.asList(topic2data));
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress());
kafkaParams.put("auto.offset.reset", "smallest");
JavaDStream<String> stream1 = KafkaUtils.createDirectStream(
ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicToSet(topic1)
).transformToPair(
// Make sure you can get offset ranges from the rdd
new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() {
@Override
public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) {
OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
offsetRanges.set(offsets);
Assert.assertEquals(topic1, offsets[0].topic());
return rdd;
}
}
).map(
new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> kv) {
return kv._2();
}
}
);
JavaDStream<String> stream2 = KafkaUtils.createDirectStream(
ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
String.class,
kafkaParams,
topicOffsetToMap(topic2, 0L),
new Function<MessageAndMetadata<String, String>, String>() {
@Override
public String call(MessageAndMetadata<String, String> msgAndMd) {
return msgAndMd.message();
}
}
);
JavaDStream<String> unifiedStream = stream1.union(stream2);
final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
unifiedStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
result.addAll(rdd.collect());
}
}
);
ssc.start();
long startTime = System.currentTimeMillis();
boolean matches = false;
while (!matches && System.currentTimeMillis() - startTime < 20000) {
matches = sent.size() == result.size();
Thread.sleep(50);
}
Assert.assertEquals(sent, result);
ssc.stop();
}
private static Set<String> topicToSet(String topic) {
Set<String> topicSet = new HashSet<>();
topicSet.add(topic);
return topicSet;
}
private static Map<TopicAndPartition, Long> topicOffsetToMap(String topic, Long offsetToStart) {
Map<TopicAndPartition, Long> topicMap = new HashMap<>();
topicMap.put(new TopicAndPartition(topic, 0), offsetToStart);
return topicMap;
}
private String[] createTopicAndSendData(String topic) {
String[] data = { topic + "-1", topic + "-2", topic + "-3"};
kafkaTestUtils.createTopic(topic, 1);
kafkaTestUtils.sendMessages(topic, data);
return data;
}
}
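
For comparison, a minimal Scala sketch of the same direct-stream pattern using the retained spark-streaming-kafka-0-10 module. The broker address, group id, and topic names are placeholders, and the local master and batch interval are chosen only for the sketch.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object DirectStream010Sketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[4]").setAppName("DirectStream010Sketch"), Seconds(1))

    // Placeholder consumer configuration; adjust bootstrap.servers and group.id for a real cluster.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "direct-stream-sketch",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    // Subscribe to the same two topics the removed 0.8 test exercised.
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("topic1", "topic2"), kafkaParams))

    stream.map(_.value).print()
    ssc.start()
    ssc.awaitTermination()
  }
}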

View file

@ -1,156 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import scala.Tuple2;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
public class JavaKafkaRDDSuite implements Serializable {
private transient JavaSparkContext sc = null;
private transient KafkaTestUtils kafkaTestUtils = null;
@Before
public void setUp() {
kafkaTestUtils = new KafkaTestUtils();
kafkaTestUtils.setup();
SparkConf sparkConf = new SparkConf()
.setMaster("local[4]").setAppName(this.getClass().getSimpleName());
sc = new JavaSparkContext(sparkConf);
}
@After
public void tearDown() {
if (sc != null) {
sc.stop();
sc = null;
}
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown();
kafkaTestUtils = null;
}
}
@Test
public void testKafkaRDD() throws InterruptedException {
String topic1 = "topic1";
String topic2 = "topic2";
createTopicAndSendData(topic1);
createTopicAndSendData(topic2);
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", kafkaTestUtils.brokerAddress());
OffsetRange[] offsetRanges = {
OffsetRange.create(topic1, 0, 0, 1),
OffsetRange.create(topic2, 0, 0, 1)
};
Map<TopicAndPartition, Broker> emptyLeaders = new HashMap<>();
Map<TopicAndPartition, Broker> leaders = new HashMap<>();
String[] hostAndPort = kafkaTestUtils.brokerAddress().split(":");
Broker broker = Broker.create(hostAndPort[0], Integer.parseInt(hostAndPort[1]));
leaders.put(new TopicAndPartition(topic1, 0), broker);
leaders.put(new TopicAndPartition(topic2, 0), broker);
JavaRDD<String> rdd1 = KafkaUtils.createRDD(
sc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
offsetRanges
).map(
new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> kv) {
return kv._2();
}
}
);
JavaRDD<String> rdd2 = KafkaUtils.createRDD(
sc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
String.class,
kafkaParams,
offsetRanges,
emptyLeaders,
new Function<MessageAndMetadata<String, String>, String>() {
@Override
public String call(MessageAndMetadata<String, String> msgAndMd) {
return msgAndMd.message();
}
}
);
JavaRDD<String> rdd3 = KafkaUtils.createRDD(
sc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
String.class,
kafkaParams,
offsetRanges,
leaders,
new Function<MessageAndMetadata<String, String>, String>() {
@Override
public String call(MessageAndMetadata<String, String> msgAndMd) {
return msgAndMd.message();
}
}
);
// Just making sure the Java user APIs work; the Scala tests handle logic corner cases
long count1 = rdd1.count();
long count2 = rdd2.count();
long count3 = rdd3.count();
Assert.assertTrue(count1 > 0);
Assert.assertEquals(count1, count2);
Assert.assertEquals(count1, count3);
}
private String[] createTopicAndSendData(String topic) {
String[] data = { topic + "-1", topic + "-2", topic + "-3"};
kafkaTestUtils.createTopic(topic, 1);
kafkaTestUtils.sendMessages(topic, data);
return data;
}
}
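
For comparison, a minimal Scala sketch of building an RDD from explicit offset ranges with the kafka-0-10 module. The broker address, group id, topics, and offsets are placeholders; createRDD in that module takes the consumer configuration as a java.util.Map.

import java.util.{Map => JMap}

import scala.collection.JavaConverters._

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkContext
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies, OffsetRange}

object KafkaRdd010Sketch {
  def run(sc: SparkContext): Unit = {
    val kafkaParams: JMap[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "rdd-sketch").asJava

    // Read offset 0 (inclusive) up to 1 (exclusive) of partition 0 for two topics.
    val offsetRanges = Array(
      OffsetRange("topic1", 0, 0, 1),
      OffsetRange("topic2", 0, 0, 1))

    val rdd = KafkaUtils.createRDD[String, String](
      sc, kafkaParams, offsetRanges, LocationStrategies.PreferConsistent)

    rdd.map(_.value).collect().foreach(println)
  }
}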

View file

@ -1,144 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka;
import java.io.Serializable;
import java.util.*;
import scala.Tuple2;
import kafka.serializer.StringDecoder;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
public class JavaKafkaStreamSuite implements Serializable {
private transient JavaStreamingContext ssc = null;
private transient Random random = new Random();
private transient KafkaTestUtils kafkaTestUtils = null;
@Before
public void setUp() {
kafkaTestUtils = new KafkaTestUtils();
kafkaTestUtils.setup();
SparkConf sparkConf = new SparkConf()
.setMaster("local[4]").setAppName(this.getClass().getSimpleName());
ssc = new JavaStreamingContext(sparkConf, new Duration(500));
}
@After
public void tearDown() {
if (ssc != null) {
ssc.stop();
ssc = null;
}
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown();
kafkaTestUtils = null;
}
}
@Test
public void testKafkaStream() throws InterruptedException {
String topic = "topic1";
Map<String, Integer> topics = new HashMap<>();
topics.put(topic, 1);
Map<String, Integer> sent = new HashMap<>();
sent.put("a", 5);
sent.put("b", 3);
sent.put("c", 10);
kafkaTestUtils.createTopic(topic, 1);
kafkaTestUtils.sendMessages(topic, sent);
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("zookeeper.connect", kafkaTestUtils.zkAddress());
kafkaParams.put("group.id", "test-consumer-" + random.nextInt(10000));
kafkaParams.put("auto.offset.reset", "smallest");
JavaPairDStream<String, String> stream = KafkaUtils.createStream(ssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topics,
StorageLevel.MEMORY_ONLY_SER());
final Map<String, Long> result = Collections.synchronizedMap(new HashMap<String, Long>());
JavaDStream<String> words = stream.map(
new Function<Tuple2<String, String>, String>() {
@Override
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
}
);
words.countByValue().foreachRDD(new VoidFunction<JavaPairRDD<String, Long>>() {
@Override
public void call(JavaPairRDD<String, Long> rdd) {
List<Tuple2<String, Long>> ret = rdd.collect();
for (Tuple2<String, Long> r : ret) {
if (result.containsKey(r._1())) {
result.put(r._1(), result.get(r._1()) + r._2());
} else {
result.put(r._1(), r._2());
}
}
}
}
);
ssc.start();
long startTime = System.currentTimeMillis();
AssertionError lastError = null;
while (System.currentTimeMillis() - startTime < 20000) {
try {
Assert.assertEquals(sent.size(), result.size());
for (Map.Entry<String, Integer> e : sent.entrySet()) {
Assert.assertEquals(e.getValue().intValue(), result.get(e.getKey()).intValue());
}
return;
} catch (AssertionError e) {
lastError = e;
}
Thread.sleep(200);
}
if (lastError != null) {
throw lastError;
} else {
Assert.fail("timeout");
}
}
}

View file

@ -1,28 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Set everything to be logged to the file target/unit-tests.log
log4j.rootCategory=INFO, file
log4j.appender.file=org.apache.log4j.FileAppender
log4j.appender.file.append=true
log4j.appender.file.file=target/unit-tests.log
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
# Ignore messages below warning level from Jetty, because it's a bit verbose
log4j.logger.org.spark_project.jetty=WARN

View file

@ -1,636 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.io.File
import java.util.{ Arrays, UUID }
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.concurrent.atomic.AtomicLong
import scala.collection.JavaConverters._
import scala.concurrent.duration._
import scala.language.postfixOps
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
import org.scalatest.concurrent.Eventually
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.scheduler._
import org.apache.spark.streaming.scheduler.rate.RateEstimator
import org.apache.spark.util.Utils
class DirectKafkaStreamSuite
extends SparkFunSuite
with BeforeAndAfter
with BeforeAndAfterAll
with Eventually
with Logging {
val sparkConf = new SparkConf()
.setMaster("local[4]")
.setAppName(this.getClass.getSimpleName)
private var ssc: StreamingContext = _
private var testDir: File = _
private var kafkaTestUtils: KafkaTestUtils = _
override def beforeAll {
super.beforeAll()
kafkaTestUtils = new KafkaTestUtils
kafkaTestUtils.setup()
}
override def afterAll {
try {
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown()
kafkaTestUtils = null
}
} finally {
super.afterAll()
}
}
after {
if (ssc != null) {
ssc.stop(stopSparkContext = true)
}
if (testDir != null) {
Utils.deleteRecursively(testDir)
}
}
test("basic stream receiving with multiple topics and smallest starting offset") {
val topics = Set("basic1", "basic2", "basic3")
val data = Map("a" -> 7, "b" -> 9)
topics.foreach { t =>
kafkaTestUtils.createTopic(t)
kafkaTestUtils.sendMessages(t, data)
}
val totalSent = data.values.sum * topics.size
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
ssc = new StreamingContext(sparkConf, Milliseconds(200))
val stream = withClue("Error creating direct stream") {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topics)
}
val allReceived = new ConcurrentLinkedQueue[(String, String)]()
// hold a reference to the current offset ranges, so it can be used downstream
var offsetRanges = Array[OffsetRange]()
stream.transform { rdd =>
// Get the offset ranges in the RDD
offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd
}.foreachRDD { rdd =>
for (o <- offsetRanges) {
logInfo(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
}
val collected = rdd.mapPartitionsWithIndex { (i, iter) =>
// For each partition, get size of the range in the partition,
// and the number of items in the partition
val off = offsetRanges(i)
val all = iter.toSeq
val partSize = all.size
val rangeSize = off.untilOffset - off.fromOffset
Iterator((partSize, rangeSize))
}.collect
// Verify whether number of elements in each partition
// matches with the corresponding offset range
collected.foreach { case (partSize, rangeSize) =>
assert(partSize === rangeSize, "offset ranges are wrong")
}
}
stream.foreachRDD { rdd => allReceived.addAll(Arrays.asList(rdd.collect(): _*)) }
ssc.start()
eventually(timeout(20000.milliseconds), interval(200.milliseconds)) {
assert(allReceived.size === totalSent,
"didn't get expected number of messages, messages:\n" +
allReceived.asScala.mkString("\n"))
}
ssc.stop()
}
test("receiving from largest starting offset") {
val topic = "largest"
val topicPartition = TopicAndPartition(topic, 0)
val data = Map("a" -> 10)
kafkaTestUtils.createTopic(topic)
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "largest"
)
val kc = new KafkaCluster(kafkaParams)
def getLatestOffset(): Long = {
kc.getLatestLeaderOffsets(Set(topicPartition)).right.get(topicPartition).offset
}
// Send some initial messages before starting context
kafkaTestUtils.sendMessages(topic, data)
eventually(timeout(10 seconds), interval(20 milliseconds)) {
assert(getLatestOffset() > 3)
}
val offsetBeforeStart = getLatestOffset()
// Setup context and kafka stream with largest offset
ssc = new StreamingContext(sparkConf, Milliseconds(200))
val stream = withClue("Error creating direct stream") {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, Set(topic))
}
assert(
stream.asInstanceOf[DirectKafkaInputDStream[_, _, _, _, _]]
.fromOffsets(topicPartition) >= offsetBeforeStart,
"Start offset not from latest"
)
val collectedData = new ConcurrentLinkedQueue[String]()
stream.map { _._2 }.foreachRDD { rdd => collectedData.addAll(Arrays.asList(rdd.collect(): _*)) }
ssc.start()
val newData = Map("b" -> 10)
kafkaTestUtils.sendMessages(topic, newData)
eventually(timeout(10 seconds), interval(50 milliseconds)) {
assert(collectedData.contains("b"))
}
assert(!collectedData.contains("a"))
ssc.stop()
}
test("creating stream by offset") {
val topic = "offset"
val topicPartition = TopicAndPartition(topic, 0)
val data = Map("a" -> 10)
kafkaTestUtils.createTopic(topic)
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "largest"
)
val kc = new KafkaCluster(kafkaParams)
def getLatestOffset(): Long = {
kc.getLatestLeaderOffsets(Set(topicPartition)).right.get(topicPartition).offset
}
// Send some initial messages before starting context
kafkaTestUtils.sendMessages(topic, data)
eventually(timeout(10 seconds), interval(20 milliseconds)) {
assert(getLatestOffset() >= 10)
}
val offsetBeforeStart = getLatestOffset()
// Setup context and kafka stream with largest offset
ssc = new StreamingContext(sparkConf, Milliseconds(200))
val stream = withClue("Error creating direct stream") {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
ssc, kafkaParams, Map(topicPartition -> 11L),
(m: MessageAndMetadata[String, String]) => m.message())
}
assert(
stream.asInstanceOf[DirectKafkaInputDStream[_, _, _, _, _]]
.fromOffsets(topicPartition) >= offsetBeforeStart,
"Start offset not from latest"
)
val collectedData = new ConcurrentLinkedQueue[String]()
stream.foreachRDD { rdd => collectedData.addAll(Arrays.asList(rdd.collect(): _*)) }
ssc.start()
val newData = Map("b" -> 10)
kafkaTestUtils.sendMessages(topic, newData)
eventually(timeout(10 seconds), interval(50 milliseconds)) {
assert(collectedData.contains("b"))
}
assert(!collectedData.contains("a"))
ssc.stop()
}
// Test to verify the offset ranges can be recovered from the checkpoints
test("offset recovery") {
val topic = "recovery"
kafkaTestUtils.createTopic(topic)
testDir = Utils.createTempDir()
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
// Send data to Kafka and wait for it to be received
def sendData(data: Seq[Int]) {
val strings = data.map { _.toString}
kafkaTestUtils.sendMessages(topic, strings.map { _ -> 1}.toMap)
}
// Setup the streaming context
ssc = new StreamingContext(sparkConf, Milliseconds(100))
val kafkaStream = withClue("Error creating direct stream") {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, Set(topic))
}
val keyedStream = kafkaStream.map { v => "key" -> v._2.toInt }
val stateStream = keyedStream.updateStateByKey { (values: Seq[Int], state: Option[Int]) =>
Some(values.sum + state.getOrElse(0))
}
ssc.checkpoint(testDir.getAbsolutePath)
// This is to ensure all the data is eventually received only once
stateStream.foreachRDD { (rdd: RDD[(String, Int)]) =>
rdd.collect().headOption.foreach { x =>
DirectKafkaStreamSuite.total.set(x._2)
}
}
ssc.start()
// Send some data
for (i <- (1 to 10).grouped(4)) {
sendData(i)
}
eventually(timeout(20 seconds), interval(50 milliseconds)) {
assert(DirectKafkaStreamSuite.total.get === (1 to 10).sum)
}
ssc.stop()
// Verify that offset ranges were generated
// Since "offsetRangesAfterStop" will be used to compare with "recoveredOffsetRanges", we should
// collect offset ranges after stopping. Otherwise, because new RDDs keep being generated before
// stopping, we may not be able to get the latest RDDs, then "recoveredOffsetRanges" will
// contain something not in "offsetRangesAfterStop".
val offsetRangesAfterStop = getOffsetRanges(kafkaStream)
assert(offsetRangesAfterStop.size >= 1, "No offset ranges generated")
assert(
offsetRangesAfterStop.head._2.forall { _.fromOffset === 0 },
"starting offset not zero"
)
logInfo("====== RESTARTING ========")
// Recover context from checkpoints
ssc = new StreamingContext(testDir.getAbsolutePath)
val recoveredStream = ssc.graph.getInputStreams().head.asInstanceOf[DStream[(String, String)]]
// Verify offset ranges have been recovered
val recoveredOffsetRanges = getOffsetRanges(recoveredStream).map { x => (x._1, x._2.toSet) }
assert(recoveredOffsetRanges.size > 0, "No offset ranges recovered")
val earlierOffsetRanges = offsetRangesAfterStop.map { x => (x._1, x._2.toSet) }
assert(
recoveredOffsetRanges.forall { or =>
earlierOffsetRanges.contains((or._1, or._2))
},
"Recovered ranges are not the same as the ones generated\n" +
s"recoveredOffsetRanges: $recoveredOffsetRanges\n" +
s"earlierOffsetRanges: $earlierOffsetRanges"
)
// Restart context, give more data and verify the total at the end
// If the total is right, that means each record has been received only once
ssc.start()
for (i <- (11 to 20).grouped(4)) {
sendData(i)
}
eventually(timeout(20 seconds), interval(50 milliseconds)) {
assert(DirectKafkaStreamSuite.total.get === (1 to 20).sum)
}
ssc.stop()
}
test("Direct Kafka stream report input information") {
val topic = "report-test"
val data = Map("a" -> 7, "b" -> 9)
kafkaTestUtils.createTopic(topic)
kafkaTestUtils.sendMessages(topic, data)
val totalSent = data.values.sum
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
import DirectKafkaStreamSuite._
ssc = new StreamingContext(sparkConf, Milliseconds(200))
val collector = new InputInfoCollector
ssc.addStreamingListener(collector)
val stream = withClue("Error creating direct stream") {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, Set(topic))
}
val allReceived = new ConcurrentLinkedQueue[(String, String)]
stream.foreachRDD { rdd => allReceived.addAll(Arrays.asList(rdd.collect(): _*)) }
ssc.start()
eventually(timeout(20000.milliseconds), interval(200.milliseconds)) {
assert(allReceived.size === totalSent,
"didn't get expected number of messages, messages:\n" +
allReceived.asScala.mkString("\n"))
// Calculate all the record number collected in the StreamingListener.
assert(collector.numRecordsSubmitted.get() === totalSent)
assert(collector.numRecordsStarted.get() === totalSent)
assert(collector.numRecordsCompleted.get() === totalSent)
}
ssc.stop()
}
test("maxMessagesPerPartition with backpressure disabled") {
val topic = "maxMessagesPerPartition"
val kafkaStream = getDirectKafkaStream(topic, None)
val input = Map(TopicAndPartition(topic, 0) -> 50L, TopicAndPartition(topic, 1) -> 50L)
assert(kafkaStream.maxMessagesPerPartition(input).get ==
Map(TopicAndPartition(topic, 0) -> 10L, TopicAndPartition(topic, 1) -> 10L))
}
test("maxMessagesPerPartition with no lag") {
val topic = "maxMessagesPerPartition"
val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 100))
val kafkaStream = getDirectKafkaStream(topic, rateController)
val input = Map(TopicAndPartition(topic, 0) -> 0L, TopicAndPartition(topic, 1) -> 0L)
assert(kafkaStream.maxMessagesPerPartition(input).isEmpty)
}
test("maxMessagesPerPartition respects max rate") {
val topic = "maxMessagesPerPartition"
val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 1000))
val kafkaStream = getDirectKafkaStream(topic, rateController)
val input = Map(TopicAndPartition(topic, 0) -> 1000L, TopicAndPartition(topic, 1) -> 1000L)
assert(kafkaStream.maxMessagesPerPartition(input).get ==
Map(TopicAndPartition(topic, 0) -> 10L, TopicAndPartition(topic, 1) -> 10L))
}
test("using rate controller") {
val topic = "backpressure"
val topicPartitions = Set(TopicAndPartition(topic, 0), TopicAndPartition(topic, 1))
kafkaTestUtils.createTopic(topic, 2)
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
val batchIntervalMilliseconds = 100
val estimator = new ConstantEstimator(100)
val messages = Map("foo" -> 200)
kafkaTestUtils.sendMessages(topic, messages)
val sparkConf = new SparkConf()
// Safe, even with streaming, because we're using the direct API.
// Using 1 core is useful to make the test more predictable.
.setMaster("local[1]")
.setAppName(this.getClass.getSimpleName)
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// Setup the streaming context
ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds))
val kafkaStream = withClue("Error creating direct stream") {
val kc = new KafkaCluster(kafkaParams)
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
val m = kc.getEarliestLeaderOffsets(topicPartitions)
.fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset))
new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc, kafkaParams, m, messageHandler) {
override protected[streaming] val rateController =
Some(new DirectKafkaRateController(id, estimator))
}
}
val collectedData = new ConcurrentLinkedQueue[Array[String]]()
// Used for assertion failure messages.
def dataToString: String =
collectedData.asScala.map(_.mkString("[", ",", "]")).mkString("{", ", ", "}")
// This is to collect the raw data received from Kafka
kafkaStream.foreachRDD { (rdd: RDD[(String, String)], time: Time) =>
val data = rdd.map { _._2 }.collect()
collectedData.add(data)
}
ssc.start()
// Try different rate limits.
// Wait for arrays of data to appear matching the rate.
Seq(100, 50, 20).foreach { rate =>
collectedData.clear() // Empty this buffer on each pass.
estimator.updateRate(rate) // Set a new rate.
// Expect blocks of data equal to "rate", scaled by the interval length in secs.
val expectedSize = Math.round(rate * batchIntervalMilliseconds * 0.001)
eventually(timeout(5.seconds), interval(batchIntervalMilliseconds.milliseconds)) {
// Assert that rate estimator values are used to determine maxMessagesPerPartition.
// Funky "-" in message makes the complete assertion message read better.
assert(collectedData.asScala.exists(_.size == expectedSize),
s" - No arrays of size $expectedSize for rate $rate found in $dataToString")
}
}
ssc.stop()
}
test("use backpressure.initialRate with backpressure") {
backpressureTest(maxRatePerPartition = 1000, initialRate = 500, maxMessagesPerPartition = 250)
}
test("backpressure.initialRate should honor maxRatePerPartition") {
backpressureTest(maxRatePerPartition = 300, initialRate = 1000, maxMessagesPerPartition = 150)
}
private def backpressureTest(
maxRatePerPartition: Int,
initialRate: Int,
maxMessagesPerPartition: Int) = {
val topic = UUID.randomUUID().toString
val topicPartitions = Set(TopicAndPartition(topic, 0))
kafkaTestUtils.createTopic(topic, 1)
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
val sparkConf = new SparkConf()
// Safe, even with streaming, because we're using the direct API.
// Using 1 core is useful to make the test more predictable.
.setMaster("local[1]")
.setAppName(this.getClass.getSimpleName)
.set("spark.streaming.backpressure.enabled", "true")
.set("spark.streaming.backpressure.initialRate", initialRate.toString)
.set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition.toString)
val messages = Map("foo" -> 5000)
kafkaTestUtils.sendMessages(topic, messages)
ssc = new StreamingContext(sparkConf, Milliseconds(500))
val kafkaStream = withClue("Error creating direct stream") {
val kc = new KafkaCluster(kafkaParams)
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
val m = kc.getEarliestLeaderOffsets(topicPartitions)
.fold(e => Map.empty[TopicAndPartition, Long], m => m.mapValues(lo => lo.offset))
new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc, kafkaParams, m, messageHandler)
}
kafkaStream.start()
val input = Map(new TopicAndPartition(topic, 0) -> 1000L)
assert(kafkaStream.maxMessagesPerPartition(input).get ==
Map(new TopicAndPartition(topic, 0) -> maxMessagesPerPartition))
kafkaStream.stop()
}
test("maxMessagesPerPartition with zero offset and rate equal to one") {
val topic = "backpressure"
val kafkaParams = Map(
"metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"auto.offset.reset" -> "smallest"
)
val batchIntervalMilliseconds = 60000
val sparkConf = new SparkConf()
// Safe, even with streaming, because we're using the direct API.
// Using 1 core is useful to make the test more predictable.
.setMaster("local[1]")
.setAppName(this.getClass.getSimpleName)
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// Setup the streaming context
ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds))
val estimatedRate = 1L
val kafkaStream = withClue("Error creating direct stream") {
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
val fromOffsets = Map(
TopicAndPartition(topic, 0) -> 0L,
TopicAndPartition(topic, 1) -> 0L,
TopicAndPartition(topic, 2) -> 0L,
TopicAndPartition(topic, 3) -> 0L
)
new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc, kafkaParams, fromOffsets, messageHandler) {
override protected[streaming] val rateController =
Some(new DirectKafkaRateController(id, null) {
override def getLatestRate() = estimatedRate
})
}
}
val offsets = Map(
TopicAndPartition(topic, 0) -> 0L,
TopicAndPartition(topic, 1) -> 100L,
TopicAndPartition(topic, 2) -> 200L,
TopicAndPartition(topic, 3) -> 300L
)
val result = kafkaStream.maxMessagesPerPartition(offsets)
val expected = Map(
TopicAndPartition(topic, 0) -> 1L,
TopicAndPartition(topic, 1) -> 10L,
TopicAndPartition(topic, 2) -> 20L,
TopicAndPartition(topic, 3) -> 30L
)
assert(result.contains(expected), s"Number of messages per partition must be at least 1")
}
/** Get the generated offset ranges from the DirectKafkaStream */
private def getOffsetRanges[K, V](
kafkaStream: DStream[(K, V)]): Seq[(Time, Array[OffsetRange])] = {
kafkaStream.generatedRDDs.mapValues { rdd =>
rdd.asInstanceOf[KafkaRDD[K, V, _, _, (K, V)]].offsetRanges
}.toSeq.sortBy { _._1 }
}
private def getDirectKafkaStream(topic: String, mockRateController: Option[RateController]) = {
val batchIntervalMilliseconds = 100
val sparkConf = new SparkConf()
.setMaster("local[1]")
.setAppName(this.getClass.getSimpleName)
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// Setup the streaming context
ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds))
val earliestOffsets = Map(TopicAndPartition(topic, 0) -> 0L, TopicAndPartition(topic, 1) -> 0L)
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
new DirectKafkaInputDStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc, Map[String, String](), earliestOffsets, messageHandler) {
override protected[streaming] val rateController = mockRateController
}
}
}
object DirectKafkaStreamSuite {
val total = new AtomicLong(-1L)
class InputInfoCollector extends StreamingListener {
val numRecordsSubmitted = new AtomicLong(0L)
val numRecordsStarted = new AtomicLong(0L)
val numRecordsCompleted = new AtomicLong(0L)
override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = {
numRecordsSubmitted.addAndGet(batchSubmitted.batchInfo.numRecords)
}
override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
numRecordsStarted.addAndGet(batchStarted.batchInfo.numRecords)
}
override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
numRecordsCompleted.addAndGet(batchCompleted.batchInfo.numRecords)
}
}
}
private[streaming] class ConstantEstimator(@volatile private var rate: Long)
extends RateEstimator {
def updateRate(newRate: Long): Unit = {
rate = newRate
}
def compute(
time: Long,
elements: Long,
processingDelay: Long,
schedulingDelay: Long): Option[Double] = Some(rate)
}
private[streaming] class ConstantRateController(id: Int, estimator: RateEstimator, rate: Long)
extends RateController(id, estimator) {
override def publish(rate: Long): Unit = ()
override def getLatestRate(): Long = rate
}
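
The offset-range pattern exercised above carries over to the kafka-0-10 module, which also allows committing offsets back to Kafka rather than relying only on checkpoints. A minimal sketch follows; the stream is assumed to come from kafka010 KafkaUtils.createDirectStream as sketched earlier, and the processing step is a placeholder.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

object OffsetCommitSketch {
  def processAndCommit(stream: DStream[ConsumerRecord[String, String]]): Unit = {
    stream.foreachRDD { rdd =>
      // The direct stream's RDDs expose their offset ranges, just as in the removed 0.8 API.
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreach(_ => ())  // placeholder for real per-record processing

      // Commit the consumed offsets back to Kafka asynchronously (no ZooKeeper involved).
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
  }
}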

View file

@ -1,86 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import scala.util.Random
import kafka.common.TopicAndPartition
import org.scalatest.BeforeAndAfterAll
import org.apache.spark.SparkFunSuite
class KafkaClusterSuite extends SparkFunSuite with BeforeAndAfterAll {
private val topic = "kcsuitetopic" + Random.nextInt(10000)
private val topicAndPartition = TopicAndPartition(topic, 0)
private var kc: KafkaCluster = null
private var kafkaTestUtils: KafkaTestUtils = _
override def beforeAll() {
super.beforeAll()
kafkaTestUtils = new KafkaTestUtils
kafkaTestUtils.setup()
kafkaTestUtils.createTopic(topic)
kafkaTestUtils.sendMessages(topic, Map("a" -> 1))
kc = new KafkaCluster(Map("metadata.broker.list" -> kafkaTestUtils.brokerAddress))
}
override def afterAll() {
try {
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown()
kafkaTestUtils = null
}
} finally {
super.afterAll()
}
}
test("metadata apis") {
val leader = kc.findLeaders(Set(topicAndPartition)).right.get(topicAndPartition)
val leaderAddress = s"${leader._1}:${leader._2}"
assert(leaderAddress === kafkaTestUtils.brokerAddress, "didn't get leader")
val parts = kc.getPartitions(Set(topic)).right.get
assert(parts(topicAndPartition), "didn't get partitions")
val err = kc.getPartitions(Set(topic + "BAD"))
assert(err.isLeft, "getPartitions for a nonexistent topic should be an error")
}
test("leader offset apis") {
val earliest = kc.getEarliestLeaderOffsets(Set(topicAndPartition)).right.get
assert(earliest(topicAndPartition).offset === 0, "didn't get earliest")
val latest = kc.getLatestLeaderOffsets(Set(topicAndPartition)).right.get
assert(latest(topicAndPartition).offset === 1, "didn't get latest")
}
test("consumer offset apis") {
val group = "kcsuitegroup" + Random.nextInt(10000)
val offset = Random.nextInt(10000)
val set = kc.setConsumerOffsets(group, Map(topicAndPartition -> offset))
assert(set.isRight, "didn't set consumer offsets")
val get = kc.getConsumerOffsets(group, Set(topicAndPartition)).right.get
assert(get(topicAndPartition) === offset, "didn't get consumer offsets")
}
}
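
The metadata and leader-offset lookups that KafkaCluster provided are covered by the new Kafka consumer API, assuming a kafka-clients version that provides beginningOffsets and endOffsets (0.10.1 or later). A minimal sketch, with a placeholder broker address, group id, and topic:

import java.util.{Collections, Properties}

import scala.collection.JavaConverters._

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer

object OffsetLookupSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")  // placeholder broker
    props.put("group.id", "offset-lookup-sketch")
    props.put("key.deserializer", classOf[StringDeserializer].getName)
    props.put("value.deserializer", classOf[StringDeserializer].getName)

    val consumer = new KafkaConsumer[String, String](props)
    try {
      val tp = new TopicPartition("topic1", 0)
      // beginningOffsets/endOffsets take over from getEarliestLeaderOffsets/getLatestLeaderOffsets.
      val earliest = consumer.beginningOffsets(Collections.singleton(tp)).asScala
      val latest = consumer.endOffsets(Collections.singleton(tp)).asScala
      println(s"earliest=${earliest(tp)} latest=${latest(tp)}")
    } finally {
      consumer.close()
    }
  }
}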

View file

@ -1,182 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import scala.util.Random
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.apache.spark._
class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll {
private var kafkaTestUtils: KafkaTestUtils = _
private val sparkConf = new SparkConf().setMaster("local[4]")
.setAppName(this.getClass.getSimpleName)
private var sc: SparkContext = _
override def beforeAll {
super.beforeAll()
sc = new SparkContext(sparkConf)
kafkaTestUtils = new KafkaTestUtils
kafkaTestUtils.setup()
}
override def afterAll {
try {
try {
if (sc != null) {
sc.stop
sc = null
}
} finally {
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown()
kafkaTestUtils = null
}
}
} finally {
super.afterAll()
}
}
test("basic usage") {
val topic = s"topicbasic-${Random.nextInt}-${System.currentTimeMillis}"
kafkaTestUtils.createTopic(topic)
val messages = Array("the", "quick", "brown", "fox")
kafkaTestUtils.sendMessages(topic, messages)
val kafkaParams = Map("metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"group.id" -> s"test-consumer-${Random.nextInt}-${System.currentTimeMillis}")
val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size))
val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
sc, kafkaParams, offsetRanges)
val received = rdd.map(_._2).collect.toSet
assert(received === messages.toSet)
// size-related method optimizations return sane results
assert(rdd.count === messages.size)
assert(rdd.countApprox(0).getFinalValue.mean === messages.size)
assert(!rdd.isEmpty)
assert(rdd.take(1).size === 1)
assert(rdd.take(1).head._2 === messages.head)
assert(rdd.take(messages.size + 10).size === messages.size)
val emptyRdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
sc, kafkaParams, Array(OffsetRange(topic, 0, 0, 0)))
assert(emptyRdd.isEmpty)
// invalid offset ranges throw exceptions
val badRanges = Array(OffsetRange(topic, 0, 0, messages.size + 1))
intercept[SparkException] {
KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
sc, kafkaParams, badRanges)
}
}
test("iterator boundary conditions") {
// the idea is to find e.g. off-by-one errors between what kafka has available and the rdd
val topic = s"topicboundary-${Random.nextInt}-${System.currentTimeMillis}"
val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
kafkaTestUtils.createTopic(topic)
val kafkaParams = Map("metadata.broker.list" -> kafkaTestUtils.brokerAddress,
"group.id" -> s"test-consumer-${Random.nextInt}-${System.currentTimeMillis}")
val kc = new KafkaCluster(kafkaParams)
// this is the "lots of messages" case
kafkaTestUtils.sendMessages(topic, sent)
val sentCount = sent.values.sum
// rdd defined from leaders after sending messages, should get the number sent
val rdd = getRdd(kc, Set(topic))
assert(rdd.isDefined)
val ranges = rdd.get.asInstanceOf[HasOffsetRanges].offsetRanges
val rangeCount = ranges.map(o => o.untilOffset - o.fromOffset).sum
assert(rangeCount === sentCount, "offset range didn't include all sent messages")
assert(rdd.get.count === sentCount, "didn't get all sent messages")
val rangesMap = ranges.map(o => TopicAndPartition(o.topic, o.partition) -> o.untilOffset).toMap
// make sure consumer offsets are committed before the next getRdd call
kc.setConsumerOffsets(kafkaParams("group.id"), rangesMap).fold(
err => throw new Exception(err.mkString("\n")),
_ => ()
)
// this is the "0 messages" case
val rdd2 = getRdd(kc, Set(topic))
// shouldn't get anything, since message is sent after rdd was defined
val sentOnlyOne = Map("d" -> 1)
kafkaTestUtils.sendMessages(topic, sentOnlyOne)
assert(rdd2.isDefined)
assert(rdd2.get.count === 0, "got messages when there shouldn't be any")
// this is the "exactly 1 message" case, namely the single message from sentOnlyOne above
val rdd3 = getRdd(kc, Set(topic))
// send lots of messages after rdd was defined, they shouldn't show up
kafkaTestUtils.sendMessages(topic, Map("extra" -> 22))
assert(rdd3.isDefined)
assert(rdd3.get.count === sentOnlyOne.values.sum, "didn't get exactly one message")
}
// Get an RDD from the committed consumer offsets up to the latest leader offsets.
private def getRdd(kc: KafkaCluster, topics: Set[String]) = {
val groupId = kc.kafkaParams("group.id")
def consumerOffsets(topicPartitions: Set[TopicAndPartition]) = {
kc.getConsumerOffsets(groupId, topicPartitions).right.toOption.orElse(
kc.getEarliestLeaderOffsets(topicPartitions).right.toOption.map { offs =>
offs.map(kv => kv._1 -> kv._2.offset)
}
)
}
kc.getPartitions(topics).right.toOption.flatMap { topicPartitions =>
consumerOffsets(topicPartitions).flatMap { from =>
kc.getLatestLeaderOffsets(topicPartitions).right.toOption.map { until =>
val offsetRanges = from.map { case (tp: TopicAndPartition, fromOffset: Long) =>
OffsetRange(tp.topic, tp.partition, fromOffset, until(tp).offset)
}.toArray
val leaders = until.map { case (tp: TopicAndPartition, lo: KafkaCluster.LeaderOffset) =>
tp -> Broker(lo.host, lo.port)
}.toMap
KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, String](
sc, kc.kafkaParams, offsetRanges, leaders,
(mmd: MessageAndMetadata[String, String]) => s"${mmd.offset} ${mmd.message}")
}
}
}
}
}

View file

@ -1,92 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random
import kafka.serializer.StringDecoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.concurrent.Eventually
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfterAll {
private var ssc: StreamingContext = _
private var kafkaTestUtils: KafkaTestUtils = _
override def beforeAll(): Unit = {
super.beforeAll()
kafkaTestUtils = new KafkaTestUtils
kafkaTestUtils.setup()
}
override def afterAll(): Unit = {
try {
try {
if (ssc != null) {
ssc.stop()
ssc = null
}
} finally {
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown()
kafkaTestUtils = null
}
}
} finally {
super.afterAll()
}
}
test("Kafka input stream") {
val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName)
ssc = new StreamingContext(sparkConf, Milliseconds(500))
val topic = "topic1"
val sent = Map("a" -> 5, "b" -> 3, "c" -> 10)
kafkaTestUtils.createTopic(topic)
kafkaTestUtils.sendMessages(topic, sent)
val kafkaParams = Map("zookeeper.connect" -> kafkaTestUtils.zkAddress,
"group.id" -> s"test-consumer-${Random.nextInt(10000)}",
"auto.offset.reset" -> "smallest")
val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)
val result = new mutable.HashMap[String, Long]()
stream.map(_._2).countByValue().foreachRDD { r =>
r.collect().foreach { kv =>
result.synchronized {
val count = result.getOrElseUpdate(kv._1, 0) + kv._2
result.put(kv._1, count)
}
}
}
ssc.start()
eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
assert(result.synchronized { sent === result })
}
ssc.stop()
}
}

View file

@ -1,153 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka
import java.io.File
import scala.collection.mutable
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Random
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
import org.scalatest.concurrent.Eventually
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.util.Utils
class ReliableKafkaStreamSuite extends SparkFunSuite
with BeforeAndAfterAll with BeforeAndAfter with Eventually {
private val sparkConf = new SparkConf()
.setMaster("local[4]")
.setAppName(this.getClass.getSimpleName)
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
private val data = Map("a" -> 10, "b" -> 10, "c" -> 10)
private var kafkaTestUtils: KafkaTestUtils = _
private var groupId: String = _
private var kafkaParams: Map[String, String] = _
private var ssc: StreamingContext = _
private var tempDirectory: File = null
override def beforeAll(): Unit = {
super.beforeAll()
kafkaTestUtils = new KafkaTestUtils
kafkaTestUtils.setup()
groupId = s"test-consumer-${Random.nextInt(10000)}"
kafkaParams = Map(
"zookeeper.connect" -> kafkaTestUtils.zkAddress,
"group.id" -> groupId,
"auto.offset.reset" -> "smallest"
)
tempDirectory = Utils.createTempDir()
}
override def afterAll(): Unit = {
try {
Utils.deleteRecursively(tempDirectory)
if (kafkaTestUtils != null) {
kafkaTestUtils.teardown()
kafkaTestUtils = null
}
} finally {
super.afterAll()
}
}
before {
ssc = new StreamingContext(sparkConf, Milliseconds(500))
ssc.checkpoint(tempDirectory.getAbsolutePath)
}
after {
if (ssc != null) {
ssc.stop(stopSparkContext = true)
ssc = null
}
}
test("Reliable Kafka input stream with single topic") {
val topic = "test-topic"
kafkaTestUtils.createTopic(topic)
kafkaTestUtils.sendMessages(topic, data)
// Verify that no offset has been committed for this group/topic/partition before starting.
assert(getCommitOffset(groupId, topic, 0) === None)
val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, Map(topic -> 1), StorageLevel.MEMORY_ONLY)
val result = new mutable.HashMap[String, Long]()
stream.map { case (k, v) => v }.foreachRDD { r =>
val ret = r.collect()
ret.foreach { v =>
val count = result.getOrElseUpdate(v, 0) + 1
result.put(v, count)
}
}
ssc.start()
eventually(timeout(20000 milliseconds), interval(200 milliseconds)) {
// A basic processing verification for ReliableKafkaReceiver.
// Verify that the number of received messages equals the number sent.
assert(data.size === result.size)
// Verify that each received count matches the data that was sent.
data.keys.foreach { k => assert(data(k) === result(k).toInt) }
// Verify that the committed offset equals the total number of messages.
assert(getCommitOffset(groupId, topic, 0) === Some(29L))
}
}
test("Reliable Kafka input stream with multiple topics") {
val topics = Map("topic1" -> 1, "topic2" -> 1, "topic3" -> 1)
topics.foreach { case (t, _) =>
kafkaTestUtils.createTopic(t)
kafkaTestUtils.sendMessages(t, data)
}
// Before starting, verify that no offsets have been committed for any group/topic/partition.
topics.foreach { case (t, _) => assert(getCommitOffset(groupId, t, 0) === None) }
// Consume all the data sent to the broker, which will potentially commit the offsets internally.
val stream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topics, StorageLevel.MEMORY_ONLY)
stream.foreachRDD(_ => Unit)
ssc.start()
eventually(timeout(20000 milliseconds), interval(100 milliseconds)) {
// Verify the offset for each group/topic to see whether they are equal to the expected one.
topics.foreach { case (t, _) => assert(getCommitOffset(groupId, t, 0) === Some(29L)) }
}
}
/** Get the partition offset from ZooKeeper. */
private def getCommitOffset(groupId: String, topic: String, partition: Int): Option[Long] = {
val topicDirs = new ZKGroupTopicDirs(groupId, topic)
val zkPath = s"${topicDirs.consumerOffsetDir}/$partition"
ZkUtils.readDataMaybeNull(kafkaTestUtils.zookeeperClient, zkPath)._1.map(_.toLong)
}
}
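
With the receiver-based 0.8 stream gone, committed offsets live in Kafka itself rather than ZooKeeper, so they can be read back with the consumer API instead of ZkUtils. A rough sketch of the suite's getCommitOffset helper under that assumption; the broker address is a placeholder.

import java.util.Properties

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer

object CommittedOffsetSketch {
  // Reads the committed offset for a group/topic/partition from Kafka instead of ZooKeeper.
  def committedOffset(groupId: String, topic: String, partition: Int): Option[Long] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")  // placeholder broker
    props.put("group.id", groupId)
    props.put("key.deserializer", classOf[StringDeserializer].getName)
    props.put("value.deserializer", classOf[StringDeserializer].getName)

    val consumer = new KafkaConsumer[String, String](props)
    try {
      // committed() returns null when no offset has been committed for this partition.
      Option(consumer.committed(new TopicPartition(topic, partition))).map(_.offset)
    } finally {
      consumer.close()
    }
  }
}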

View file

@ -2674,14 +2674,6 @@
</modules> </modules>
</profile> </profile>
<profile>
<id>kafka-0-8</id>
<modules>
<module>external/kafka-0-8</module>
<module>external/kafka-0-8-assembly</module>
</modules>
</profile>
<profile> <profile>
<id>test-java-home</id> <id>test-java-home</id>
<activation> <activation>

View file

@ -55,14 +55,14 @@ object BuildCommons {
).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects
val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn, val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn,
streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, sparkGangliaLgpl, streamingKinesisAsl,
dockerIntegrationTests, hadoopCloud, kubernetesIntegrationTests) = dockerIntegrationTests, hadoopCloud, kubernetesIntegrationTests) =
Seq("kubernetes", "mesos", "yarn", Seq("kubernetes", "mesos", "yarn",
"streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "ganglia-lgpl", "streaming-kinesis-asl",
"docker-integration-tests", "hadoop-cloud", "kubernetes-integration-tests").map(ProjectRef(buildLocation, _)) "docker-integration-tests", "hadoop-cloud", "kubernetes-integration-tests").map(ProjectRef(buildLocation, _))
val assemblyProjects@Seq(networkYarn, streamingKafkaAssembly, streamingKafka010Assembly, streamingKinesisAslAssembly) = val assemblyProjects@Seq(networkYarn, streamingKafka010Assembly, streamingKinesisAslAssembly) =
Seq("network-yarn", "streaming-kafka-0-8-assembly", "streaming-kafka-0-10-assembly", "streaming-kinesis-asl-assembly") Seq("network-yarn", "streaming-kafka-0-10-assembly", "streaming-kinesis-asl-assembly")
.map(ProjectRef(buildLocation, _)) .map(ProjectRef(buildLocation, _))
val copyJarsProjects@Seq(assembly, examples) = Seq("assembly", "examples") val copyJarsProjects@Seq(assembly, examples) = Seq("assembly", "examples")
@ -580,10 +580,8 @@ object Assembly {
.getOrElse(SbtPomKeys.effectivePom.value.getProperties.get("hadoop.version").asInstanceOf[String]) .getOrElse(SbtPomKeys.effectivePom.value.getProperties.get("hadoop.version").asInstanceOf[String])
}, },
jarName in assembly := { jarName in assembly := {
if (moduleName.value.contains("streaming-kafka-0-8-assembly") if (moduleName.value.contains("streaming-kafka-0-10-assembly")
|| moduleName.value.contains("streaming-kafka-0-10-assembly")
|| moduleName.value.contains("streaming-kinesis-asl-assembly")) { || moduleName.value.contains("streaming-kinesis-asl-assembly")) {
// This must match the same name used in maven (see external/kafka-0-8-assembly/pom.xml)
s"${moduleName.value}-${version.value}.jar" s"${moduleName.value}-${version.value}.jar"
} else { } else {
s"${moduleName.value}-${version.value}-hadoop${hadoopVersion.value}.jar" s"${moduleName.value}-${version.value}-hadoop${hadoopVersion.value}.jar"

View file

@ -9,13 +9,6 @@ Module contents
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
pyspark.streaming.kafka module
------------------------------
.. automodule:: pyspark.streaming.kafka
:members:
:undoc-members:
:show-inheritance:
pyspark.streaming.kinesis module pyspark.streaming.kinesis module
-------------------------------- --------------------------------
.. automodule:: pyspark.streaming.kinesis .. automodule:: pyspark.streaming.kinesis

View file

@ -45,7 +45,7 @@ class DStream(object):
for more details on RDDs). for more details on RDDs).
DStreams can either be created from live data (such as, data from TCP DStreams can either be created from live data (such as, data from TCP
sockets, Kafka, etc.) using a L{StreamingContext} or it can be sockets, etc.) using a L{StreamingContext} or it can be
generated by transforming existing DStreams using operations such as generated by transforming existing DStreams using operations such as
`map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming `map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming
program is running, each DStream periodically generates a RDD, either program is running, each DStream periodically generates a RDD, either
@ -626,7 +626,6 @@ class TransformedDStream(DStream):
# Using type() to avoid folding the functions and compacting the DStreams which is not # Using type() to avoid folding the functions and compacting the DStreams which is not
# not strictly an object of TransformedDStream. # not strictly an object of TransformedDStream.
# Changed here is to avoid bug in KafkaTransformedDStream when calling offsetRanges().
if (type(prev) is TransformedDStream and if (type(prev) is TransformedDStream and
not prev.is_cached and not prev.is_checkpointed): not prev.is_cached and not prev.is_checkpointed):
prev_func = prev.func prev_func = prev.func

View file

@ -1,506 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import warnings
from py4j.protocol import Py4JJavaError
from pyspark.rdd import RDD
from pyspark.storagelevel import StorageLevel
from pyspark.serializers import AutoBatchedSerializer, PickleSerializer, PairDeserializer, \
NoOpSerializer
from pyspark.streaming import DStream
from pyspark.streaming.dstream import TransformedDStream
from pyspark.streaming.util import TransformFunction
__all__ = ['Broker', 'KafkaMessageAndMetadata', 'KafkaUtils', 'OffsetRange',
'TopicAndPartition', 'utf8_decoder']
def utf8_decoder(s):
""" Decode the unicode as UTF-8 """
if s is None:
return None
return s.decode('utf-8')
class KafkaUtils(object):
@staticmethod
def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None,
storageLevel=StorageLevel.MEMORY_AND_DISK_2,
keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
"""
Create an input stream that pulls messages from a Kafka Broker.
:param ssc: StreamingContext object
:param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..).
:param groupId: The group id for this consumer.
:param topics: Dict of (topic_name -> numPartitions) to consume.
Each partition is consumed in its own thread.
:param kafkaParams: Additional params for Kafka
:param storageLevel: RDD storage level.
:param keyDecoder: A function used to decode key (default is utf8_decoder)
:param valueDecoder: A function used to decode value (default is utf8_decoder)
:return: A DStream object
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
if kafkaParams is None:
kafkaParams = dict()
kafkaParams.update({
"zookeeper.connect": zkQuorum,
"group.id": groupId,
"zookeeper.connection.timeout.ms": "10000",
})
if not isinstance(topics, dict):
raise TypeError("topics should be dict")
jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
helper = KafkaUtils._get_helper(ssc._sc)
jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel)
ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
stream = DStream(jstream, ssc, ser)
return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
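# A minimal usage sketch of the receiver-based API above (illustrative only; the
# ZooKeeper address, group id and topic name are assumptions, and `ssc` is an
# existing StreamingContext):
#   stream = KafkaUtils.createStream(ssc, "localhost:2181", "test-group", {"my-topic": 1})
#   lines = stream.map(lambda kv: kv[1])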
@staticmethod
def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
messageHandler=None):
"""
Create an input stream that directly pulls messages from a Kafka broker at specific offsets.
This is not a receiver-based Kafka input stream; it pulls messages from Kafka
in each batch duration and processes them without storing them.
This does not use Zookeeper to store offsets. The consumed offsets are tracked
by the stream itself. For interoperability with Kafka monitoring tools that depend on
Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
You can access the offsets used in each batch from the generated RDDs (see
To recover from driver failures, you have to enable checkpointing in the StreamingContext.
The information on consumed offset can be recovered from the checkpoint.
See the programming guide for details (constraints, etc.).
:param ssc: StreamingContext object.
:param topics: list of topic_name to consume.
:param kafkaParams: Additional params for Kafka.
:param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
point of the stream (a dictionary mapping `TopicAndPartition` to
integers).
:param keyDecoder: A function used to decode key (default is utf8_decoder).
:param valueDecoder: A function used to decode value (default is utf8_decoder).
:param messageHandler: A function used to convert KafkaMessageAndMetadata. You can access
the metadata using messageHandler (default is None).
:return: A DStream object
.. note:: Experimental
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
if fromOffsets is None:
fromOffsets = dict()
if not isinstance(topics, list):
raise TypeError("topics should be list")
if not isinstance(kafkaParams, dict):
raise TypeError("kafkaParams should be dict")
def funcWithoutMessageHandler(k_v):
return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))
def funcWithMessageHandler(m):
m._set_key_decoder(keyDecoder)
m._set_value_decoder(valueDecoder)
return messageHandler(m)
helper = KafkaUtils._get_helper(ssc._sc)
jfromOffsets = dict([(k._jTopicAndPartition(helper),
v) for (k, v) in fromOffsets.items()])
if messageHandler is None:
ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
func = funcWithoutMessageHandler
jstream = helper.createDirectStreamWithoutMessageHandler(
ssc._jssc, kafkaParams, set(topics), jfromOffsets)
else:
ser = AutoBatchedSerializer(PickleSerializer())
func = funcWithMessageHandler
jstream = helper.createDirectStreamWithMessageHandler(
ssc._jssc, kafkaParams, set(topics), jfromOffsets)
stream = DStream(jstream, ssc, ser).map(func)
return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
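# A minimal usage sketch of the direct API above (illustrative only; the broker
# address and topic are assumptions, and `ssc` is an existing StreamingContext):
#   kafkaParams = {"metadata.broker.list": "localhost:9092", "auto.offset.reset": "smallest"}
#   fromOffsets = {TopicAndPartition("my-topic", 0): 0}
#   stream = KafkaUtils.createDirectStream(ssc, ["my-topic"], kafkaParams, fromOffsets)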
@staticmethod
def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
messageHandler=None):
"""
Create an RDD from Kafka using offset ranges for each topic and partition.
:param sc: SparkContext object
:param kafkaParams: Additional params for Kafka
:param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume
:param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty
map, in which case leaders will be looked up on the driver.
:param keyDecoder: A function used to decode key (default is utf8_decoder)
:param valueDecoder: A function used to decode value (default is utf8_decoder)
:param messageHandler: A function used to convert KafkaMessageAndMetadata. You can access
the metadata using messageHandler (default is None).
:return: An RDD object
.. note:: Experimental
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
if leaders is None:
leaders = dict()
if not isinstance(kafkaParams, dict):
raise TypeError("kafkaParams should be dict")
if not isinstance(offsetRanges, list):
raise TypeError("offsetRanges should be list")
def funcWithoutMessageHandler(k_v):
return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))
def funcWithMessageHandler(m):
m._set_key_decoder(keyDecoder)
m._set_value_decoder(valueDecoder)
return messageHandler(m)
helper = KafkaUtils._get_helper(sc)
joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
jleaders = dict([(k._jTopicAndPartition(helper),
v._jBroker(helper)) for (k, v) in leaders.items()])
if messageHandler is None:
jrdd = helper.createRDDWithoutMessageHandler(
sc._jsc, kafkaParams, joffsetRanges, jleaders)
ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
rdd = RDD(jrdd, sc, ser).map(funcWithoutMessageHandler)
else:
jrdd = helper.createRDDWithMessageHandler(
sc._jsc, kafkaParams, joffsetRanges, jleaders)
rdd = RDD(jrdd, sc).map(funcWithMessageHandler)
return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
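# A minimal usage sketch of createRDD above (illustrative only; the broker address,
# topic and offsets are assumptions, and `sc` is an existing SparkContext):
#   kafkaParams = {"metadata.broker.list": "localhost:9092"}
#   offsetRanges = [OffsetRange("my-topic", 0, 0, 100)]
#   rdd = KafkaUtils.createRDD(sc, kafkaParams, offsetRanges)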
@staticmethod
def _get_helper(sc):
try:
return sc._jvm.org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper()
except TypeError as e:
if str(e) == "'JavaPackage' object is not callable":
KafkaUtils._printErrorMsg(sc)
raise
@staticmethod
def _printErrorMsg(sc):
print("""
________________________________________________________________________________________________
Spark Streaming's Kafka libraries not found in class path. Try one of the following.
1. Include the Kafka library and its dependencies with the
spark-submit command, as
$ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8:%s ...
2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-0-8-assembly, Version = %s.
Then, include the jar in the spark-submit command as
$ bin/spark-submit --jars <spark-streaming-kafka-0-8-assembly.jar> ...
________________________________________________________________________________________________
""" % (sc.version, sc.version))
class OffsetRange(object):
"""
Represents a range of offsets from a single Kafka TopicAndPartition.
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, topic, partition, fromOffset, untilOffset):
"""
Create an OffsetRange to represent range of offsets
:param topic: Kafka topic name.
:param partition: Kafka partition id.
:param fromOffset: Inclusive starting offset.
:param untilOffset: Exclusive ending offset.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
self.topic = topic
self.partition = partition
self.fromOffset = fromOffset
self.untilOffset = untilOffset
def __eq__(self, other):
if isinstance(other, self.__class__):
return (self.topic == other.topic
and self.partition == other.partition
and self.fromOffset == other.fromOffset
and self.untilOffset == other.untilOffset)
else:
return False
def __ne__(self, other):
return not self.__eq__(other)
def __str__(self):
return "OffsetRange(topic: %s, partition: %d, range: [%d -> %d]" \
% (self.topic, self.partition, self.fromOffset, self.untilOffset)
def _jOffsetRange(self, helper):
return helper.createOffsetRange(self.topic, self.partition, self.fromOffset,
self.untilOffset)
class TopicAndPartition(object):
"""
Represents a specific topic and partition for Kafka.
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, topic, partition):
"""
Create a Python TopicAndPartition to map to the Java related object
:param topic: Kafka topic name.
:param partition: Kafka partition id.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
self._topic = topic
self._partition = partition
def _jTopicAndPartition(self, helper):
return helper.createTopicAndPartition(self._topic, self._partition)
def __eq__(self, other):
if isinstance(other, self.__class__):
return (self._topic == other._topic
and self._partition == other._partition)
else:
return False
def __ne__(self, other):
return not self.__eq__(other)
def __hash__(self):
return (self._topic, self._partition).__hash__()
class Broker(object):
"""
Represent the host and port info for a Kafka broker.
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, host, port):
"""
Create a Python Broker to map to the Java related object.
:param host: Broker's hostname.
:param port: Broker's port.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
self._host = host
self._port = port
def _jBroker(self, helper):
return helper.createBroker(self._host, self._port)
class KafkaRDD(RDD):
"""
A Python wrapper of KafkaRDD, to provide additional information on normal RDD.
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, jrdd, ctx, jrdd_deserializer):
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
RDD.__init__(self, jrdd, ctx, jrdd_deserializer)
def offsetRanges(self):
"""
Get the OffsetRange of specific KafkaRDD.
:return: A list of OffsetRange
"""
helper = KafkaUtils._get_helper(self.ctx)
joffsetRanges = helper.offsetRangesOfKafkaRDD(self._jrdd.rdd())
ranges = [OffsetRange(o.topic(), o.partition(), o.fromOffset(), o.untilOffset())
for o in joffsetRanges]
return ranges
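# For example (illustrative only; `kafka_rdd` is assumed to be a KafkaRDD created
# via KafkaUtils.createRDD):
#   for o in kafka_rdd.offsetRanges():
#       print("%s[%d]: %d -> %d" % (o.topic, o.partition, o.fromOffset, o.untilOffset))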
class KafkaDStream(DStream):
"""
A Python wrapper of KafkaDStream
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, jdstream, ssc, jrdd_deserializer):
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
def foreachRDD(self, func):
"""
Apply a function to each RDD in this DStream.
"""
if func.__code__.co_argcount == 1:
old_func = func
func = lambda r, rdd: old_func(rdd)
jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) \
.rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
api = self._ssc._jvm.PythonDStream
api.callForeachRDD(self._jdstream, jfunc)
def transform(self, func):
"""
Return a new DStream in which each RDD is generated by applying a function
on each RDD of this DStream.
`func` can have one argument of `rdd`, or have two arguments of
(`time`, `rdd`)
"""
if func.__code__.co_argcount == 1:
oldfunc = func
func = lambda t, rdd: oldfunc(rdd)
assert func.__code__.co_argcount == 2, "func should take one or two arguments"
return KafkaTransformedDStream(self, func)
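# For example (illustrative only; `stream` is assumed to be a KafkaDStream obtained
# from KafkaUtils.createDirectStream):
#   def with_offset_ranges(time, rdd):
#       for o in rdd.offsetRanges():
#           print(o)
#       return rdd
#   stream.transform(with_offset_ranges).count().pprint()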
class KafkaTransformedDStream(TransformedDStream):
"""
Kafka specific wrapper of TransformedDStream to transform on Kafka RDD.
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, prev, func):
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
TransformedDStream.__init__(self, prev, func)
@property
def _jdstream(self):
if self._jdstream_val is not None:
return self._jdstream_val
jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) \
.rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
self._jdstream_val = dstream.asJavaDStream()
return self._jdstream_val
class KafkaMessageAndMetadata(object):
"""
Kafka message and metadata information. Including topic, partition, offset and message
.. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
See SPARK-21893.
"""
def __init__(self, topic, partition, offset, key, message):
"""
Python wrapper of Kafka MessageAndMetadata
:param topic: topic name of this Kafka message
:param partition: partition id of this Kafka message
:param offset: Offset of this Kafka message in the specific partition
:param key: key payload of this Kafka message; can be null if this Kafka message has no key
specified. The returned data is an undecoded bytearray.
:param message: actual message payload of this Kafka message; the returned data is an
undecoded bytearray.
"""
warnings.warn(
"Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
"See SPARK-21893.",
DeprecationWarning)
self.topic = topic
self.partition = partition
self.offset = offset
self._rawKey = key
self._rawMessage = message
self._keyDecoder = utf8_decoder
self._valueDecoder = utf8_decoder
def __str__(self):
return "KafkaMessageAndMetadata(topic: %s, partition: %d, offset: %d, key and message...)" \
% (self.topic, self.partition, self.offset)
def __repr__(self):
return self.__str__()
def __reduce__(self):
return (KafkaMessageAndMetadata,
(self.topic, self.partition, self.offset, self._rawKey, self._rawMessage))
def _set_key_decoder(self, decoder):
self._keyDecoder = decoder
def _set_value_decoder(self, decoder):
self._valueDecoder = decoder
@property
def key(self):
return self._keyDecoder(self._rawKey)
@property
def message(self):
return self._valueDecoder(self._rawMessage)


@@ -47,7 +47,6 @@ if sys.version >= "3":
 from pyspark.context import SparkConf, SparkContext, RDD
 from pyspark.storagelevel import StorageLevel
 from pyspark.streaming.context import StreamingContext
-from pyspark.streaming.kafka import Broker, KafkaUtils, OffsetRange, TopicAndPartition
 from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
 from pyspark.streaming.listener import StreamingListener
@@ -1047,259 +1046,6 @@ class CheckpointTests(unittest.TestCase):
         self.ssc.stop(True, True)
class KafkaStreamTests(PySparkStreamingTestCase):
timeout = 20 # seconds
duration = 1
def setUp(self):
super(KafkaStreamTests, self).setUp()
self._kafkaTestUtils = self.ssc._jvm.org.apache.spark.streaming.kafka.KafkaTestUtils()
self._kafkaTestUtils.setup()
def tearDown(self):
super(KafkaStreamTests, self).tearDown()
if self._kafkaTestUtils is not None:
self._kafkaTestUtils.teardown()
self._kafkaTestUtils = None
def _randomTopic(self):
return "topic-%d" % random.randint(0, 10000)
def _validateStreamResult(self, sendData, stream):
result = {}
for i in chain.from_iterable(self._collect(stream.map(lambda x: x[1]),
sum(sendData.values()))):
result[i] = result.get(i, 0) + 1
self.assertEqual(sendData, result)
def _validateRddResult(self, sendData, rdd):
result = {}
for i in rdd.map(lambda x: x[1]).collect():
result[i] = result.get(i, 0) + 1
self.assertEqual(sendData, result)
def test_kafka_stream(self):
"""Test the Python Kafka stream API."""
topic = self._randomTopic()
sendData = {"a": 3, "b": 5, "c": 10}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
"test-streaming-consumer", {topic: 1},
{"auto.offset.reset": "smallest"})
self._validateStreamResult(sendData, stream)
def test_kafka_direct_stream(self):
"""Test the Python direct Kafka stream API."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
"auto.offset.reset": "smallest"}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
self._validateStreamResult(sendData, stream)
def test_kafka_direct_stream_from_offset(self):
"""Test the Python direct Kafka stream API with start offset specified."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
fromOffsets = {TopicAndPartition(topic, 0): long(0)}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
self._validateStreamResult(sendData, stream)
def test_kafka_rdd(self):
"""Test the Python direct Kafka RDD API."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2}
offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
self._validateRddResult(sendData, rdd)
def test_kafka_rdd_with_leaders(self):
"""Test the Python direct Kafka RDD API with leaders."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
address = self._kafkaTestUtils.brokerAddress().split(":")
leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
self._validateRddResult(sendData, rdd)
def test_kafka_rdd_get_offsetRanges(self):
"""Test Python direct Kafka RDD get OffsetRanges."""
topic = self._randomTopic()
sendData = {"a": 3, "b": 4, "c": 5}
offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
self.assertEqual(offsetRanges, rdd.offsetRanges())
def test_kafka_direct_stream_foreach_get_offsetRanges(self):
"""Test the Python direct Kafka stream foreachRDD get offsetRanges."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
"auto.offset.reset": "smallest"}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
offsetRanges = []
def getOffsetRanges(_, rdd):
for o in rdd.offsetRanges():
offsetRanges.append(o)
stream.foreachRDD(getOffsetRanges)
self.ssc.start()
self.wait_for(offsetRanges, 1)
self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def test_kafka_direct_stream_transform_get_offsetRanges(self):
"""Test the Python direct Kafka stream transform get offsetRanges."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
"auto.offset.reset": "smallest"}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
offsetRanges = []
def transformWithOffsetRanges(rdd):
for o in rdd.offsetRanges():
offsetRanges.append(o)
return rdd
# Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together,
# only the TransformedDstreams can be folded together.
stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
self.ssc.start()
self.wait_for(offsetRanges, 1)
self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def test_topic_and_partition_equality(self):
topic_and_partition_a = TopicAndPartition("foo", 0)
topic_and_partition_b = TopicAndPartition("foo", 0)
topic_and_partition_c = TopicAndPartition("bar", 0)
topic_and_partition_d = TopicAndPartition("foo", 1)
self.assertEqual(topic_and_partition_a, topic_and_partition_b)
self.assertNotEqual(topic_and_partition_a, topic_and_partition_c)
self.assertNotEqual(topic_and_partition_a, topic_and_partition_d)
def test_kafka_direct_stream_transform_with_checkpoint(self):
"""Test the Python direct Kafka stream transform with checkpoint correctly recovered."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
"auto.offset.reset": "smallest"}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
offsetRanges = []
def transformWithOffsetRanges(rdd):
for o in rdd.offsetRanges():
offsetRanges.append(o)
return rdd
self.ssc.stop(False)
self.ssc = None
tmpdir = "checkpoint-test-%d" % random.randint(0, 10000)
def setup():
ssc = StreamingContext(self.sc, 0.5)
ssc.checkpoint(tmpdir)
stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
stream.transform(transformWithOffsetRanges).count().pprint()
return ssc
try:
ssc1 = StreamingContext.getOrCreate(tmpdir, setup)
ssc1.start()
self.wait_for(offsetRanges, 1)
self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
# To make sure some checkpoint is written
time.sleep(3)
ssc1.stop(False)
ssc1 = None
# Restart again to make sure the checkpoint is recovered correctly
ssc2 = StreamingContext.getOrCreate(tmpdir, setup)
ssc2.start()
ssc2.awaitTermination(3)
ssc2.stop(stopSparkContext=False, stopGraceFully=True)
ssc2 = None
finally:
shutil.rmtree(tmpdir)
def test_kafka_rdd_message_handler(self):
"""Test Python direct Kafka RDD MessageHandler."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 1, "c": 2}
offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
def getKeyAndDoubleMessage(m):
return m and (m.key, m.message * 2)
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges,
messageHandler=getKeyAndDoubleMessage)
self._validateRddResult({"aa": 1, "bb": 1, "cc": 2}, rdd)
def test_kafka_direct_stream_message_handler(self):
"""Test the Python direct Kafka stream MessageHandler."""
topic = self._randomTopic()
sendData = {"a": 1, "b": 2, "c": 3}
kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
"auto.offset.reset": "smallest"}
self._kafkaTestUtils.createTopic(topic)
self._kafkaTestUtils.sendMessages(topic, sendData)
def getKeyAndDoubleMessage(m):
return m and (m.key, m.message * 2)
stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams,
messageHandler=getKeyAndDoubleMessage)
self._validateStreamResult({"aa": 1, "bb": 2, "cc": 3}, stream)
 class KinesisStreamTests(PySparkStreamingTestCase):
     def test_kinesis_stream_api(self):
@@ -1371,23 +1117,6 @@ def search_jar(dir, name_prefix):
     return [jar for jar in jars if not jar.endswith(ignored_jar_suffixes)]
-def search_kafka_assembly_jar():
-    SPARK_HOME = os.environ["SPARK_HOME"]
-    kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-0-8-assembly")
-    jars = search_jar(kafka_assembly_dir, "spark-streaming-kafka-0-8-assembly")
-    if not jars:
-        raise Exception(
-            ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) +
-            "You need to build Spark with "
-            "'build/sbt -Pkafka-0-8 assembly/package streaming-kafka-0-8-assembly/assembly' or "
-            "'build/mvn -DskipTests -Pkafka-0-8 package' before running this test.")
-    elif len(jars) > 1:
-        raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please "
-                         "remove all but one") % (", ".join(jars)))
-    else:
-        return jars[0]
 def _kinesis_asl_assembly_dir():
     SPARK_HOME = os.environ["SPARK_HOME"]
     return os.path.join(SPARK_HOME, "external/kinesis-asl-assembly")
@@ -1404,38 +1133,26 @@ def search_kinesis_asl_assembly_jar():
         return jars[0]
-# Must be same as the variable and condition defined in modules.py
-kafka_test_environ_var = "ENABLE_KAFKA_0_8_TESTS"
-are_kafka_tests_enabled = os.environ.get(kafka_test_environ_var) == '1'
 # Must be same as the variable and condition defined in KinesisTestUtils.scala and modules.py
 kinesis_test_environ_var = "ENABLE_KINESIS_TESTS"
 are_kinesis_tests_enabled = os.environ.get(kinesis_test_environ_var) == '1'
 if __name__ == "__main__":
     from pyspark.streaming.tests import *
-    kafka_assembly_jar = search_kafka_assembly_jar()
     kinesis_asl_assembly_jar = search_kinesis_asl_assembly_jar()
     if kinesis_asl_assembly_jar is None:
         kinesis_jar_present = False
-        jars = kafka_assembly_jar
+        jars_args = ""
     else:
         kinesis_jar_present = True
-        jars = "%s,%s" % (kafka_assembly_jar, kinesis_asl_assembly_jar)
+        jars_args = "--jars %s" % kinesis_asl_assembly_jar
     existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
-    jars_args = "--jars %s" % jars
     os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args])
     testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests,
                  StreamingListenerTests]
-    if are_kafka_tests_enabled:
-        testcases.append(KafkaStreamTests)
-    else:
-        sys.stderr.write(
-            "Skipped test_kafka_stream (enable by setting environment variable %s=1"
-            % kafka_test_environ_var)
     if kinesis_jar_present is True:
         testcases.append(KinesisStreamTests)
     elif are_kinesis_tests_enabled is False: