[SPARK-35418][SQL] Add sentences function to functions.{scala,py}

### What changes were proposed in this pull request?

This PR adds `sentences`, a string function that has been available in SQL since `2.0.0` but is missing from `functions.{scala,py}`.

### Why are the changes needed?

For now, this function can only be used from SQL.
It would be good to be able to use it from Scala and Python code as well as SQL.
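
For reference, this is roughly how the function has to be reached today, through SQL only (a sketch, assuming an existing `SparkSession` named `spark`; the sample text is made up):

```scala
// Before this PR, `sentences` is only reachable through SQL / `selectExpr`.
// Minimal sketch, assuming a SparkSession named `spark` already exists.
spark.sql("SELECT sentences('Hi there! How are you?', 'en', 'US')").show(false)
// Expected output, roughly: [[Hi, there], [How, are, you]]
```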

### Does this PR introduce _any_ user-facing change?

Yes. Users can use this function from Scala and Python.
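
For illustration, a minimal sketch of the newly exposed Scala API (this assumes a `SparkSession` named `spark` with `spark.implicits._` in scope; the column name and sample data are made up):

```scala
import org.apache.spark.sql.functions.{lit, sentences}
import spark.implicits._

val df = Seq("Hi there! How are you?").toDF("str")

// New overload that falls back to the default locale.
df.select(sentences($"str")).show(false)

// New overload with an explicit language and country.
df.select(sentences($"str", lit("en"), lit("US"))).show(false)
// Both produce rows like [[Hi, there], [How, are, you]]
```

The Python counterpart mirrors this: `sentences(string, language=None, country=None)`, with omitted locale arguments passed to the JVM as empty strings.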

### How was this patch tested?

New tests added to `StringFunctionsSuite` (see the diff below).

Closes #32566 from sarutak/sentences-function.

Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com>
Signed-off-by: Kousuke Saruta <sarutak@oss.nttdata.com>
Kousuke Saruta 2021-05-19 20:07:28 +09:00
parent 46f7d780d3
commit 9283bebbbd
5 changed files with 71 additions and 0 deletions


@@ -493,6 +493,7 @@ Functions
    schema_of_csv
    schema_of_json
    second
    sentences
    sequence
    sha1
    sha2


@@ -2681,6 +2681,45 @@ def overlay(src, replace, pos, len=-1):
    ))


def sentences(string, language=None, country=None):
    """
    Splits a string into arrays of sentences, where each sentence is an array of words.
    The 'language' and 'country' arguments are optional, and if omitted, the default locale is used.

    .. versionadded:: 3.2.0

    Parameters
    ----------
    string : :class:`~pyspark.sql.Column` or str
        a string to be split
    language : :class:`~pyspark.sql.Column` or str, optional
        a language of the locale
    country : :class:`~pyspark.sql.Column` or str, optional
        a country of the locale

    Examples
    --------
    >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])
    >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)
    +-----------------------------------+
    |sentences(string, en, US)          |
    +-----------------------------------+
    |[[This, is, an, example, sentence]]|
    +-----------------------------------+
    """
    if language is None:
        language = lit("")
    if country is None:
        country = lit("")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.sentences(
        _to_java_column(string),
        _to_java_column(language),
        _to_java_column(country)
    ))


def substring(str, pos, len):
    """
    Substring starts at `pos` and is of length `len` when str is String type or


@@ -221,6 +221,11 @@ def map_from_entries(col: ColumnOrName) -> Column: ...
def array_repeat(col: ColumnOrName, count: Union[Column, int]) -> Column: ...
def arrays_zip(*cols: ColumnOrName) -> Column: ...
def map_concat(*cols: ColumnOrName) -> Column: ...
def sentences(
    col: ColumnOrName,
    language: Optional[ColumnOrName] = ...,
    country: Optional[ColumnOrName] = ...
) -> Column: ...
def sequence(
    start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = ...
) -> Column: ...


@@ -2867,6 +2867,25 @@ object functions {
    new Overlay(src.expr, replace.expr, pos.expr)
  }

  /**
   * Splits a string into arrays of sentences, where each sentence is an array of words.
   *
   * @group string_funcs
   * @since 3.2.0
   */
  def sentences(string: Column, language: Column, country: Column): Column = withExpr {
    Sentences(string.expr, language.expr, country.expr)
  }

  /**
   * Splits a string into arrays of sentences, where each sentence is an array of words.
   * The default locale is used.
   *
   * @group string_funcs
   * @since 3.2.0
   */
  def sentences(string: Column): Column = withExpr {
    Sentences(string.expr)
  }

  /**
   * Translate any character in the src by a character in replaceString.
   * The characters in replaceString correspond to the characters in matchingString.


@@ -557,11 +557,18 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
      df.selectExpr("sentences(str, language, country)"),
      Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))

    checkAnswer(
      df.select(sentences($"str", $"language", $"country")),
      Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))

    // Type coercion
    checkAnswer(
      df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),
      Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))

    checkAnswer(df.select(sentences(lit(null)), sentences(lit(10)), sentences(lit(3.14))),
      Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))

    // Argument number exception
    val m = intercept[AnalysisException] {
      df.selectExpr("sentences()")