From 9283bebbbd5d8fdf2ed03d886773cc851fdd6094 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 19 May 2021 20:07:28 +0900 Subject: [PATCH] [SPARK-35418][SQL] Add sentences function to functions.{scala,py} ### What changes were proposed in this pull request? This PR adds `sentences`, a string function, which has been available in SQL since `2.0.0` but is missing from `functions.{scala,py}`. ### Why are the changes needed? This function can currently only be used from SQL. It would be good to be able to use this function from Scala/Python code as well as from SQL. ### Does this PR introduce _any_ user-facing change? Yes. Users can use this function from Scala and Python. ### How was this patch tested? New test. Closes #32566 from sarutak/sentences-function. Authored-by: Kousuke Saruta Signed-off-by: Kousuke Saruta --- python/docs/source/reference/pyspark.sql.rst | 1 + python/pyspark/sql/functions.py | 39 +++++++++++++++++++ python/pyspark/sql/functions.pyi | 5 +++ .../org/apache/spark/sql/functions.scala | 19 +++++++++ .../spark/sql/StringFunctionsSuite.scala | 7 ++++ 5 files changed, 71 insertions(+) diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst index 19e4e00960..a9d4af805c 100644 --- a/python/docs/source/reference/pyspark.sql.rst +++ b/python/docs/source/reference/pyspark.sql.rst @@ -493,6 +493,7 @@ Functions schema_of_csv schema_of_json second + sentences sequence sha1 sha2 diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 6ea7343036..f6526f0922 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2681,6 +2681,45 @@ def overlay(src, replace, pos, len=-1): )) +def sentences(string, language=None, country=None): + """ + Splits a string into arrays of sentences, where each sentence is an array of words. + The 'language' and 'country' arguments are optional, and if omitted, the default locale is used. + + .. 
versionadded:: 3.2.0 + + Parameters + ---------- + string : :class:`~pyspark.sql.Column` or str + a string to be split + language : :class:`~pyspark.sql.Column` or str, optional + a language of the locale + country : :class:`~pyspark.sql.Column` or str, optional + a country of the locale + + Examples + -------- + >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"]) + >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False) + +-----------------------------------+ + |sentences(string, en, US) | + +-----------------------------------+ + |[[This, is, an, example, sentence]]| + +-----------------------------------+ + """ + if language is None: + language = lit("") + if country is None: + country = lit("") + + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.sentences( + _to_java_column(string), + _to_java_column(language), + _to_java_column(country) + )) + + def substring(str, pos, len): """ Substring starts at `pos` and is of length `len` when str is String type or diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 674c5a7e65..0a4aabf1bf 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -221,6 +221,11 @@ def map_from_entries(col: ColumnOrName) -> Column: ... def array_repeat(col: ColumnOrName, count: Union[Column, int]) -> Column: ... def arrays_zip(*cols: ColumnOrName) -> Column: ... def map_concat(*cols: ColumnOrName) -> Column: ... +def sentences( + string: ColumnOrName, + language: Optional[ColumnOrName] = ..., + country: Optional[ColumnOrName] = ... +) -> Column: ... def sequence( start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = ... ) -> Column: ... 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index bd222d150d..b17aaa49fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2867,6 +2867,25 @@ object functions { new Overlay(src.expr, replace.expr, pos.expr) } + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. + * @group string_funcs + * @since 3.2.0 + */ + def sentences(string: Column, language: Column, country: Column): Column = withExpr { + Sentences(string.expr, language.expr, country.expr) + } + + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. + * The default locale is used. + * @group string_funcs + * @since 3.2.0 + */ + def sentences(string: Column): Column = withExpr { + Sentences(string.expr) + } + /** * Translate any character in the src by a character in replaceString. * The characters in replaceString correspond to the characters in matchingString. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 10f2b3004d..dd8a1a8478 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -557,11 +557,18 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("sentences(str, language, country)"), Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + checkAnswer( + df.select(sentences($"str", $"language", $"country")), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + // Type coercion checkAnswer( df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"), Row(null, Seq(Seq("10")), Seq(Seq("3.14")))) + checkAnswer(df.select(sentences(lit(null)), sentences(lit(10)), sentences(lit(3.14))), + Row(null, Seq(Seq("10")), Seq(Seq("3.14")))) + // Argument number exception val m = intercept[AnalysisException] { df.selectExpr("sentences()")