[SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup

## What changes were proposed in this pull request?

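Add SparkR (R) examples to the Structured Streaming programming guide for stream-static joins, stream-stream inner and outer joins, and streaming deduplication. Also fix the comment syntax in the Python deduplication example (`//` changed to `#`).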

## How was this patch tested?

manually

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #20340 from felixcheung/rstreamdoc.
This commit is contained in:
Felix Cheung 2018-01-21 11:23:51 -08:00 committed by Felix Cheung
parent 4f43d27c9e
commit 2239d7a410

View file

@ -1100,6 +1100,21 @@ streamingDf.join(staticDf, "type") # inner equi-join with a static DF
streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF
{% endhighlight %}
</div>
<div data-lang="r" markdown="1">
{% highlight r %}
# Placeholder sources: one static DataFrame and one streaming DataFrame.
staticDf <- read.df(...)
streamingDf <- read.stream(...)

# inner equi-join with a static DF
joined <- merge(streamingDf, staticDf, sort = FALSE)

# right outer join with a static DF
joined <- join(
  staticDf,
  streamingDf,
  streamingDf$value == staticDf$value,
  "right_outer"
)
{% endhighlight %}
</div>
</div>
@ -1227,6 +1242,30 @@ impressionsWithWatermark.join(
{% endhighlight %}
</div>
<div data-lang="r" markdown="1">
{% highlight r %}
# Two streaming inputs to be joined.
impressions <- read.stream(...)
clicks <- read.stream(...)

# Apply watermarks on event-time columns
impressionsWithWatermark <- withWatermark(impressions, "impressionTime", "2 hours")
clicksWithWatermark <- withWatermark(clicks, "clickTime", "3 hours")

# Join with event-time constraints: the ad ids must match and the click must
# fall within one hour after the impression.
joined <- join(
  impressionsWithWatermark,
  clicksWithWatermark,
  expr(
    paste(
      "clickAdId = impressionAdId AND",
      "clickTime >= impressionTime AND",
      "clickTime <= impressionTime + interval 1 hour"
    )
  )
)
{% endhighlight %}
</div>
</div>
@ -1287,6 +1326,23 @@ impressionsWithWatermark.join(
{% endhighlight %}
</div>
<div data-lang="r" markdown="1">
{% highlight r %}
# Outer join with event-time constraints.
# The join type is the fourth argument of join(), so it must come after the
# closing parenthesis of expr(...), not inside it.
joined <- join(
  impressionsWithWatermark,
  clicksWithWatermark,
  expr(
    paste(
      "clickAdId = impressionAdId AND",
      "clickTime >= impressionTime AND",
      "clickTime <= impressionTime + interval 1 hour"
    )
  ),
  "left_outer"  # can be "inner", "left_outer", "right_outer"
)
{% endhighlight %}
</div>
</div>
@ -1441,15 +1497,29 @@ streamingDf
{% highlight python %}
streamingDf = spark.readStream. ...
// Without watermark using guid column
# Without watermark using guid column
streamingDf.dropDuplicates("guid")
// With watermark using guid and eventTime columns
# With watermark using guid and eventTime columns
streamingDf \
.withWatermark("eventTime", "10 seconds") \
.dropDuplicates("guid", "eventTime")
{% endhighlight %}
</div>
<div data-lang="r" markdown="1">
{% highlight r %}
streamingDf <- read.stream(...)

# The two variants below are independent alternatives. Each one starts from
# the source stream and stores its result in its own variable, so the
# watermark variant does not run on the output of the first variant.

# Without watermark using guid column
dedupedByGuid <- dropDuplicates(streamingDf, "guid")

# With watermark using guid and eventTime columns
watermarked <- withWatermark(streamingDf, "eventTime", "10 seconds")
dedupedWithWatermark <- dropDuplicates(watermarked, "guid", "eventTime")
{% endhighlight %}
</div>
</div>