[SPARK-5154] [PySpark] [Streaming] Kafka streaming support in Python
This PR brings the Python API for Spark Streaming Kafka data source.
```
class KafkaUtils(__builtin__.object)
| Static methods defined here:
|
| createStream(ssc, zkQuorum, groupId, topics, storageLevel=StorageLevel(True, True, False, False,
2), keyDecoder=<function utf8_decoder>, valueDecoder=<function utf8_decoder>)
| Create an input stream that pulls messages from a Kafka Broker.
|
| :param ssc: StreamingContext object
| :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..).
| :param groupId: The group id for this consumer.
| :param topics: Dict of (topic_name -> numPartitions) to consume.
| Each partition is consumed in its own thread.
| :param storageLevel: RDD storage level.
| :param keyDecoder: A function used to decode key
| :param valueDecoder: A function used to decode value
| :return: A DStream object
```
run the example:
```
bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py localhost:2181 test
```
Author: Davies Liu <davies@databricks.com>
Author: Tathagata Das <tdas@databricks.com>
Closes #3715 from davies/kafka and squashes the following commits:
d93bfe0 [Davies Liu] Update make-distribution.sh
4280d04 [Davies Liu] address comments
e6d0427 [Davies Liu] Merge branch 'master' of github.com:apache/spark into kafka
f257071 [Davies Liu] add tests for null in RDD
23b039a [Davies Liu] address comments
9af51c4 [Davies Liu] Merge branch 'kafka' of github.com:davies/spark into kafka
a74da87 [Davies Liu] address comments
dc1eed0 [Davies Liu] Update kafka_wordcount.py
31e2317 [Davies Liu] Update kafka_wordcount.py
370ba61 [Davies Liu] Update kafka.py
97386b3 [Davies Liu] address comment
2c567a5 [Davies Liu] update logging and comment
33730d1 [Davies Liu] Merge branch 'master' of github.com:apache/spark into kafka
adeeb38 [Davies Liu] Merge pull request #3 from tdas/kafka-python-api
aea8953 [Tathagata Das] Kafka-assembly for Python API
eea16a7 [Davies Liu] refactor
f6ce899 [Davies Liu] add example and fix bugs
98c8d17 [Davies Liu] fix python style
5697a01 [Davies Liu] bypass decoder in scala
048dbe6 [Davies Liu] fix python style
75d485e [Davies Liu] add mqtt
07923c4 [Davies Liu] support kafka in Python
2015-02-02 22:16:27 -05:00
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
<!--
|
|
|
|
~ Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
~ contributor license agreements. See the NOTICE file distributed with
|
|
|
|
~ this work for additional information regarding copyright ownership.
|
|
|
|
~ The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
~ (the "License"); you may not use this file except in compliance with
|
|
|
|
~ the License. You may obtain a copy of the License at
|
|
|
|
~
|
|
|
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
~
|
|
|
|
~ Unless required by applicable law or agreed to in writing, software
|
|
|
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
~ See the License for the specific language governing permissions and
|
|
|
|
~ limitations under the License.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
|
|
<modelVersion>4.0.0</modelVersion>
|
|
|
|
<parent>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
2015-03-05 14:31:48 -05:00
|
|
|
<artifactId>spark-parent_2.10</artifactId>
|
2015-05-19 02:06:40 -04:00
|
|
|
<version>1.4.0</version>
|
[SPARK-5154] [PySpark] [Streaming] Kafka streaming support in Python
This PR brings the Python API for Spark Streaming Kafka data source.
```
class KafkaUtils(__builtin__.object)
| Static methods defined here:
|
| createStream(ssc, zkQuorum, groupId, topics, storageLevel=StorageLevel(True, True, False, False,
2), keyDecoder=<function utf8_decoder>, valueDecoder=<function utf8_decoder>)
| Create an input stream that pulls messages from a Kafka Broker.
|
| :param ssc: StreamingContext object
| :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..).
| :param groupId: The group id for this consumer.
| :param topics: Dict of (topic_name -> numPartitions) to consume.
| Each partition is consumed in its own thread.
| :param storageLevel: RDD storage level.
| :param keyDecoder: A function used to decode key
| :param valueDecoder: A function used to decode value
| :return: A DStream object
```
run the example:
```
bin/spark-submit --driver-class-path external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py localhost:2181 test
```
Author: Davies Liu <davies@databricks.com>
Author: Tathagata Das <tdas@databricks.com>
Closes #3715 from davies/kafka and squashes the following commits:
d93bfe0 [Davies Liu] Update make-distribution.sh
4280d04 [Davies Liu] address comments
e6d0427 [Davies Liu] Merge branch 'master' of github.com:apache/spark into kafka
f257071 [Davies Liu] add tests for null in RDD
23b039a [Davies Liu] address comments
9af51c4 [Davies Liu] Merge branch 'kafka' of github.com:davies/spark into kafka
a74da87 [Davies Liu] address comments
dc1eed0 [Davies Liu] Update kafka_wordcount.py
31e2317 [Davies Liu] Update kafka_wordcount.py
370ba61 [Davies Liu] Update kafka.py
97386b3 [Davies Liu] address comment
2c567a5 [Davies Liu] update logging and comment
33730d1 [Davies Liu] Merge branch 'master' of github.com:apache/spark into kafka
adeeb38 [Davies Liu] Merge pull request #3 from tdas/kafka-python-api
aea8953 [Tathagata Das] Kafka-assembly for Python API
eea16a7 [Davies Liu] refactor
f6ce899 [Davies Liu] add example and fix bugs
98c8d17 [Davies Liu] fix python style
5697a01 [Davies Liu] bypass decoder in scala
048dbe6 [Davies Liu] fix python style
75d485e [Davies Liu] add mqtt
07923c4 [Davies Liu] support kafka in Python
2015-02-02 22:16:27 -05:00
|
|
|
<relativePath>../../pom.xml</relativePath>
|
|
|
|
</parent>
|
|
|
|
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-streaming-kafka-assembly_2.10</artifactId>
|
|
|
|
<packaging>jar</packaging>
|
|
|
|
<name>Spark Project External Kafka Assembly</name>
|
|
|
|
<url>http://spark.apache.org/</url>
|
|
|
|
|
|
|
|
<properties>
|
|
|
|
<sbt.project.name>streaming-kafka-assembly</sbt.project.name>
|
|
|
|
</properties>
|
|
|
|
|
|
|
|
<dependencies>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
|
|
|
</dependencies>
|
|
|
|
|
|
|
|
<build>
|
|
|
|
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
|
|
|
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
|
|
|
<plugins>
|
|
|
|
<plugin>
|
|
|
|
<groupId>org.apache.maven.plugins</groupId>
|
|
|
|
<artifactId>maven-shade-plugin</artifactId>
|
|
|
|
<configuration>
|
|
|
|
<shadedArtifactAttached>false</shadedArtifactAttached>
|
|
|
|
<artifactSet>
|
|
|
|
<includes>
|
|
|
|
<include>*:*</include>
|
|
|
|
</includes>
|
|
|
|
</artifactSet>
|
|
|
|
<filters>
|
|
|
|
<filter>
|
|
|
|
<artifact>*:*</artifact>
|
|
|
|
<excludes>
|
|
|
|
<exclude>META-INF/*.SF</exclude>
|
|
|
|
<exclude>META-INF/*.DSA</exclude>
|
|
|
|
<exclude>META-INF/*.RSA</exclude>
|
|
|
|
</excludes>
|
|
|
|
</filter>
|
|
|
|
</filters>
|
|
|
|
</configuration>
|
|
|
|
<executions>
|
|
|
|
<execution>
|
|
|
|
<phase>package</phase>
|
|
|
|
<goals>
|
|
|
|
<goal>shade</goal>
|
|
|
|
</goals>
|
|
|
|
<configuration>
|
|
|
|
<transformers>
|
|
|
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
|
|
|
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
|
|
|
|
<resource>reference.conf</resource>
|
|
|
|
</transformer>
|
|
|
|
<transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
|
|
|
|
<resource>log4j.properties</resource>
|
|
|
|
</transformer>
|
|
|
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
|
|
|
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
|
|
|
|
</transformers>
|
|
|
|
</configuration>
|
|
|
|
</execution>
|
|
|
|
</executions>
|
|
|
|
</plugin>
|
|
|
|
</plugins>
|
|
|
|
</build>
|
|
|
|
</project>
|
|
|
|
|