[SPARK-35418][SQL] Add sentences function to functions.{scala,py}

### What changes were proposed in this pull request?

This PR adds `sentences`, a string function that has been available in SQL since `2.0.0` but is missing from `functions.{scala,py}`.

### Why are the changes needed?

For now, this function can only be used from SQL.
It would be good to be able to use it from Scala and Python code as well as SQL.
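
For reference, this is roughly how the function has to be reached today, through SQL only (a sketch, assuming an existing `SparkSession` named `spark`; the sample text is made up):

```scala
// Before this PR, `sentences` is only reachable through SQL / `selectExpr`.
// Minimal sketch, assuming a SparkSession named `spark` already exists.
spark.sql("SELECT sentences('Hi there! How are you?', 'en', 'US')").show(false)
// Expected output, roughly: [[Hi, there], [How, are, you]]
```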

### Does this PR introduce _any_ user-facing change?

Yes. Users can use this function from Scala and Python.
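
For illustration, a minimal sketch of the newly exposed Scala API (this assumes a `SparkSession` named `spark` with `spark.implicits._` in scope; the column name and sample data are made up):

```scala
import org.apache.spark.sql.functions.{lit, sentences}
import spark.implicits._

val df = Seq("Hi there! How are you?").toDF("str")

// New overload that falls back to the default locale.
df.select(sentences($"str")).show(false)

// New overload with an explicit language and country.
df.select(sentences($"str", lit("en"), lit("US"))).show(false)
// Both produce rows like [[Hi, there], [How, are, you]]
```

The Python counterpart mirrors this: `sentences(string, language=None, country=None)`, with omitted locale arguments passed to the JVM as empty strings.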

### How was this patch tested?

New tests added to `StringFunctionsSuite` (see the diff below).

Closes #32566 from sarutak/sentences-function.

Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com>
Signed-off-by: Kousuke Saruta <sarutak@oss.nttdata.com>
Kousuke Saruta 2021-05-19 20:07:28 +09:00
parent 46f7d780d3
commit 9283bebbbd
5 changed files with 71 additions and 0 deletions


@@ -493,6 +493,7 @@ Functions
    schema_of_csv
    schema_of_json
    second
    sentences
    sequence
    sha1
    sha2


@@ -2681,6 +2681,45 @@ def overlay(src, replace, pos, len=-1):
    ))


def sentences(string, language=None, country=None):
    """
    Splits a string into arrays of sentences, where each sentence is an array of words.
    The 'language' and 'country' arguments are optional, and if omitted, the default locale is used.

    .. versionadded:: 3.2.0

    Parameters
    ----------
    string : :class:`~pyspark.sql.Column` or str
        a string to be split
    language : :class:`~pyspark.sql.Column` or str, optional
        a language of the locale
    country : :class:`~pyspark.sql.Column` or str, optional
        a country of the locale

    Examples
    --------
    >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])
    >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)
    +-----------------------------------+
    |sentences(string, en, US)          |
    +-----------------------------------+
    |[[This, is, an, example, sentence]]|
    +-----------------------------------+
    """
    if language is None:
        language = lit("")
    if country is None:
        country = lit("")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.sentences(
        _to_java_column(string),
        _to_java_column(language),
        _to_java_column(country)
    ))


def substring(str, pos, len):
    """
    Substring starts at `pos` and is of length `len` when str is String type or


@@ -221,6 +221,11 @@ def map_from_entries(col: ColumnOrName) -> Column: ...
def array_repeat(col: ColumnOrName, count: Union[Column, int]) -> Column: ...
def arrays_zip(*cols: ColumnOrName) -> Column: ...
def map_concat(*cols: ColumnOrName) -> Column: ...
def sentences(
    col: ColumnOrName,
    language: Optional[ColumnOrName] = ...,
    country: Optional[ColumnOrName] = ...
) -> Column: ...
def sequence(
    start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = ...
) -> Column: ...


@@ -2867,6 +2867,25 @@ object functions {
    new Overlay(src.expr, replace.expr, pos.expr)
  }

  /**
   * Splits a string into arrays of sentences, where each sentence is an array of words.
   *
   * @group string_funcs
   * @since 3.2.0
   */
  def sentences(string: Column, language: Column, country: Column): Column = withExpr {
    Sentences(string.expr, language.expr, country.expr)
  }

  /**
   * Splits a string into arrays of sentences, where each sentence is an array of words.
   * The default locale is used.
   *
   * @group string_funcs
   * @since 3.2.0
   */
  def sentences(string: Column): Column = withExpr {
    Sentences(string.expr)
  }

  /**
   * Translate any character in the src by a character in replaceString.
   * The characters in replaceString correspond to the characters in matchingString.


@@ -557,11 +557,18 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
      df.selectExpr("sentences(str, language, country)"),
      Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))

    checkAnswer(
      df.select(sentences($"str", $"language", $"country")),
      Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))

    // Type coercion
    checkAnswer(
      df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),
      Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))

    checkAnswer(df.select(sentences(lit(null)), sentences(lit(10)), sentences(lit(3.14))),
      Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))

    // Argument number exception
    val m = intercept[AnalysisException] {
      df.selectExpr("sentences()")