From 9283bebbbd5d8fdf2ed03d886773cc851fdd6094 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.com>
Date: Wed, 19 May 2021 20:07:28 +0900
Subject: [PATCH] [SPARK-35418][SQL] Add sentences function to
 functions.{scala,py}

### What changes were proposed in this pull request?

This PR adds `sentences`, a string function that has been available in SQL since `2.0.0` but is missing from `functions.{scala,py}`.

### Why are the changes needed?

This function can only be used from SQL for now.
It would be good to be able to use this function from Scala/Python code as well as from SQL.

### Does this PR introduce _any_ user-facing change?

Yes. Users can use this function from Scala and Python.

### How was this patch tested?

New test.

Closes #32566 from sarutak/sentences-function.

Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com>
Signed-off-by: Kousuke Saruta <sarutak@oss.nttdata.com>
---
 python/docs/source/reference/pyspark.sql.rst  |  1 +
 python/pyspark/sql/functions.py               | 39 +++++++++++++++++++
 python/pyspark/sql/functions.pyi              |  5 +++
 .../org/apache/spark/sql/functions.scala      | 19 +++++++++
 .../spark/sql/StringFunctionsSuite.scala      |  7 ++++
 5 files changed, 71 insertions(+)

diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst
index 19e4e00960..a9d4af805c 100644
--- a/python/docs/source/reference/pyspark.sql.rst
+++ b/python/docs/source/reference/pyspark.sql.rst
@@ -493,6 +493,7 @@ Functions
     schema_of_csv
     schema_of_json
     second
+    sentences
     sequence
     sha1
     sha2
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6ea7343036..f6526f0922 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2681,6 +2681,45 @@ def overlay(src, replace, pos, len=-1):
     ))
 
 
+def sentences(string, language=None, country=None):
+    """
+    Breaks a string into an array of sentences, each sentence being an array of its words.
+    Both 'language' and 'country' are optional; when left out, the default locale is used.
+
+    .. versionadded:: 3.2.0
+
+    Parameters
+    ----------
+    string : :class:`~pyspark.sql.Column` or str
+        the string to break into sentences
+    language : :class:`~pyspark.sql.Column` or str, optional
+        language part of the locale
+    country : :class:`~pyspark.sql.Column` or str, optional
+        country part of the locale
+
+    Examples
+    --------
+    >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])
+    >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)
+    +-----------------------------------+
+    |sentences(string, en, US)          |
+    +-----------------------------------+
+    |[[This, is, an, example, sentence]]|
+    +-----------------------------------+
+    """
+    # A locale part that was not supplied is passed to the JVM as an empty string literal.
+    lang_col = lit("") if language is None else language
+    country_col = lit("") if country is None else country
+
+    jvm_functions = SparkContext._active_spark_context._jvm.functions
+    jcol = jvm_functions.sentences(
+        _to_java_column(string),
+        _to_java_column(lang_col),
+        _to_java_column(country_col)
+    )
+    return Column(jcol)
+
+
 def substring(str, pos, len):
     """
     Substring starts at `pos` and is of length `len` when str is String type or
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index 674c5a7e65..0a4aabf1bf 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -221,6 +221,11 @@ def map_from_entries(col: ColumnOrName) -> Column: ...
 def array_repeat(col: ColumnOrName, count: Union[Column, int]) -> Column: ...
 def arrays_zip(*cols: ColumnOrName) -> Column: ...
 def map_concat(*cols: ColumnOrName) -> Column: ...
+def sentences(
+    string: ColumnOrName,  # same parameter name as in functions.py
+    language: Optional[ColumnOrName] = ...,
+    country: Optional[ColumnOrName] = ...
+) -> Column: ...
 def sequence(
     start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = ...
 ) -> Column: ...
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index bd222d150d..b17aaa49fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2867,6 +2867,25 @@ object functions {
     new Overlay(src.expr, replace.expr, pos.expr)
   }
 
+  /**
+   * Splits a string into arrays of sentences (each an array of words) using the given locale.
+   * @group string_funcs
+   * @since 3.2.0
+   */
+  def sentences(string: Column, language: Column, country: Column): Column = withExpr {
+    Sentences(string.expr, language.expr, country.expr)
+  }
+
+  /**
+   * Splits a string into arrays of sentences, where each sentence is an array of words.
+   * No language or country is passed, so the default locale is used.
+   * @group string_funcs
+   * @since 3.2.0
+   */
+  def sentences(string: Column): Column = withExpr {
+    Sentences(string.expr)
+  }
+
   /**
    * Translate any character in the src by a character in replaceString.
    * The characters in replaceString correspond to the characters in matchingString.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 10f2b3004d..dd8a1a8478 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -557,11 +557,18 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       df.selectExpr("sentences(str, language, country)"),
       Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))
 
+    checkAnswer(
+      df.select(sentences($"str", $"language", $"country")),
+      Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))
+
     // Type coercion
     checkAnswer(
       df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),
       Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))
 
+    checkAnswer(df.select(sentences(lit(null)), sentences(lit(10)), sentences(lit(3.14))),
+      Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))
+
     // Argument number exception
     val m = intercept[AnalysisException] {
       df.selectExpr("sentences()")