[SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup
## What changes were proposed in this pull request? streaming programming guide changes ## How was this patch tested? manually Author: Felix Cheung <felixcheung_m@hotmail.com> Closes #20340 from felixcheung/rstreamdoc.
This commit is contained in:
parent
4f43d27c9e
commit
2239d7a410
|
@ -1100,6 +1100,21 @@ streamingDf.join(staticDf, "type") # inner equi-join with a static DF
|
||||||
streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF
|
streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF
|
||||||
{% endhighlight %}
|
{% endhighlight %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div data-lang="r" markdown="1">
|
||||||
|
|
||||||
|
{% highlight r %}
|
||||||
|
staticDf <- read.df(...)
|
||||||
|
streamingDf <- read.stream(...)
|
||||||
|
joined <- merge(streamingDf, staticDf, sort = FALSE) # inner equi-join with a static DF
|
||||||
|
joined <- join(
|
||||||
|
staticDf,
|
||||||
|
streamingDf,
|
||||||
|
streamingDf$value == staticDf$value,
|
||||||
|
"right_outer") # right outer join with a static DF
|
||||||
|
{% endhighlight %}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -1227,6 +1242,30 @@ impressionsWithWatermark.join(
|
||||||
|
|
||||||
{% endhighlight %}
|
{% endhighlight %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-lang="r" markdown="1">
|
||||||
|
|
||||||
|
{% highlight r %}
|
||||||
|
impressions <- read.stream(...)
|
||||||
|
clicks <- read.stream(...)
|
||||||
|
|
||||||
|
# Apply watermarks on event-time columns
|
||||||
|
impressionsWithWatermark <- withWatermark(impressions, "impressionTime", "2 hours")
|
||||||
|
clicksWithWatermark <- withWatermark(clicks, "clickTime", "3 hours")
|
||||||
|
|
||||||
|
# Join with event-time constraints
|
||||||
|
joined <- join(
|
||||||
|
impressionsWithWatermark,
|
||||||
|
clicksWithWatermark,
|
||||||
|
expr(
|
||||||
|
paste(
|
||||||
|
"clickAdId = impressionAdId AND",
|
||||||
|
"clickTime >= impressionTime AND",
|
||||||
|
"clickTime <= impressionTime + interval 1 hour"
|
||||||
|
)))
|
||||||
|
|
||||||
|
{% endhighlight %}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -1287,6 +1326,23 @@ impressionsWithWatermark.join(
|
||||||
|
|
||||||
{% endhighlight %}
|
{% endhighlight %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-lang="r" markdown="1">
|
||||||
|
|
||||||
|
{% highlight r %}
|
||||||
|
joined <- join(
|
||||||
|
impressionsWithWatermark,
|
||||||
|
clicksWithWatermark,
|
||||||
|
expr(
|
||||||
|
paste(
|
||||||
|
"clickAdId = impressionAdId AND",
|
||||||
|
"clickTime >= impressionTime AND",
|
||||||
|
"clickTime <= impressionTime + interval 1 hour"),
|
||||||
|
"left_outer" # can be "inner", "left_outer", "right_outer"
|
||||||
|
))
|
||||||
|
|
||||||
|
{% endhighlight %}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -1441,15 +1497,29 @@ streamingDf
|
||||||
{% highlight python %}
|
{% highlight python %}
|
||||||
streamingDf = spark.readStream. ...
|
streamingDf = spark.readStream. ...
|
||||||
|
|
||||||
// Without watermark using guid column
|
# Without watermark using guid column
|
||||||
streamingDf.dropDuplicates("guid")
|
streamingDf.dropDuplicates("guid")
|
||||||
|
|
||||||
// With watermark using guid and eventTime columns
|
# With watermark using guid and eventTime columns
|
||||||
streamingDf \
|
streamingDf \
|
||||||
.withWatermark("eventTime", "10 seconds") \
|
.withWatermark("eventTime", "10 seconds") \
|
||||||
.dropDuplicates("guid", "eventTime")
|
.dropDuplicates("guid", "eventTime")
|
||||||
{% endhighlight %}
|
{% endhighlight %}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div data-lang="r" markdown="1">
|
||||||
|
|
||||||
|
{% highlight r %}
|
||||||
|
streamingDf <- read.stream(...)
|
||||||
|
|
||||||
|
# Without watermark using guid column
|
||||||
|
streamingDf <- dropDuplicates(streamingDf, "guid")
|
||||||
|
|
||||||
|
# With watermark using guid and eventTime columns
|
||||||
|
streamingDf <- withWatermark(streamingDf, "eventTime", "10 seconds")
|
||||||
|
streamingDf <- dropDuplicates(streamingDf, "guid", "eventTime")
|
||||||
|
{% endhighlight %}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue