[SPARK-21293][SS][SPARKR] Add doc example for streaming join, dedup
## What changes were proposed in this pull request? streaming programming guide changes ## How was this patch tested? manually Author: Felix Cheung <felixcheung_m@hotmail.com> Closes #20340 from felixcheung/rstreamdoc.
This commit is contained in:
parent
4f43d27c9e
commit
2239d7a410
|
@ -1100,6 +1100,21 @@ streamingDf.join(staticDf, "type") # inner equi-join with a static DF
|
|||
streamingDf.join(staticDf, "type", "right_join") # right outer join with a static DF
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
|
||||
<div data-lang="r" markdown="1">
|
||||
|
||||
{% highlight r %}
|
||||
staticDf <- read.df(...)
|
||||
streamingDf <- read.stream(...)
|
||||
joined <- merge(streamingDf, staticDf, sort = FALSE) # inner equi-join with a static DF
|
||||
joined <- join(
|
||||
staticDf,
|
||||
streamingDf,
|
||||
streamingDf$value == staticDf$value,
|
||||
"right_outer") # right outer join with a static DF
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
@ -1227,6 +1242,30 @@ impressionsWithWatermark.join(
|
|||
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
<div data-lang="r" markdown="1">
|
||||
|
||||
{% highlight r %}
|
||||
impressions <- read.stream(...)
|
||||
clicks <- read.stream(...)
|
||||
|
||||
# Apply watermarks on event-time columns
|
||||
impressionsWithWatermark <- withWatermark(impressions, "impressionTime", "2 hours")
|
||||
clicksWithWatermark <- withWatermark(clicks, "clickTime", "3 hours")
|
||||
|
||||
# Join with event-time constraints
|
||||
joined <- join(
|
||||
impressionsWithWatermark,
|
||||
clicksWithWatermark,
|
||||
expr(
|
||||
paste(
|
||||
"clickAdId = impressionAdId AND",
|
||||
"clickTime >= impressionTime AND",
|
||||
"clickTime <= impressionTime + interval 1 hour"
|
||||
)))
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
@ -1287,6 +1326,23 @@ impressionsWithWatermark.join(
|
|||
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
<div data-lang="r" markdown="1">
|
||||
|
||||
{% highlight r %}
|
||||
joined <- join(
|
||||
impressionsWithWatermark,
|
||||
clicksWithWatermark,
|
||||
expr(
|
||||
paste(
|
||||
"clickAdId = impressionAdId AND",
|
||||
"clickTime >= impressionTime AND",
|
||||
"clickTime <= impressionTime + interval 1 hour"),
|
||||
"left_outer" # can be "inner", "left_outer", "right_outer"
|
||||
))
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
@ -1441,15 +1497,29 @@ streamingDf
|
|||
{% highlight python %}
|
||||
streamingDf = spark.readStream. ...
|
||||
|
||||
// Without watermark using guid column
|
||||
# Without watermark using guid column
|
||||
streamingDf.dropDuplicates("guid")
|
||||
|
||||
// With watermark using guid and eventTime columns
|
||||
# With watermark using guid and eventTime columns
|
||||
streamingDf \
|
||||
.withWatermark("eventTime", "10 seconds") \
|
||||
.dropDuplicates("guid", "eventTime")
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
<div data-lang="r" markdown="1">
|
||||
|
||||
{% highlight r %}
|
||||
streamingDf <- read.stream(...)
|
||||
|
||||
# Without watermark using guid column
|
||||
streamingDf <- dropDuplicates(streamingDf, "guid")
|
||||
|
||||
# With watermark using guid and eventTime columns
|
||||
streamingDf <- withWatermark(streamingDf, "eventTime", "10 seconds")
|
||||
streamingDf <- dropDuplicates(streamingDf, "guid", "eventTime")
|
||||
{% endhighlight %}
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
Loading…
Reference in a new issue