[SPARK-35418][SQL] Add sentences function to functions.{scala,py}
### What changes were proposed in this pull request? This PR adds `sentences`, a string function, which is present as of `2.0.0` but missing in `functions.{scala,py}`. ### Why are the changes needed? This function can currently only be used from SQL. It would be good if we could use this function from Scala/Python code as well as SQL. ### Does this PR introduce _any_ user-facing change? Yes. Users can use this function from Scala and Python. ### How was this patch tested? New test. Closes #32566 from sarutak/sentences-function. Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com> Signed-off-by: Kousuke Saruta <sarutak@oss.nttdata.com>
This commit is contained in:
parent
46f7d780d3
commit
9283bebbbd
|
@@ -493,6 +493,7 @@ Functions
|
|||
schema_of_csv
|
||||
schema_of_json
|
||||
second
|
||||
sentences
|
||||
sequence
|
||||
sha1
|
||||
sha2
|
||||
|
|
|
@@ -2681,6 +2681,45 @@ def overlay(src, replace, pos, len=-1):
|
|||
))
|
||||
|
||||
|
||||
def sentences(string, language=None, country=None):
    """
    Splits a string into arrays of sentences, where each sentence is an array of words.
    The 'language' and 'country' arguments are optional, and if omitted, the default locale is used.

    .. versionadded:: 3.2.0

    Parameters
    ----------
    string : :class:`~pyspark.sql.Column` or str
        a string to be split
    language : :class:`~pyspark.sql.Column` or str, optional
        a language of the locale
    country : :class:`~pyspark.sql.Column` or str, optional
        a country of the locale

    Examples
    --------
    >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])
    >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)
    +-----------------------------------+
    |sentences(string, en, US)          |
    +-----------------------------------+
    |[[This, is, an, example, sentence]]|
    +-----------------------------------+
    """
    # An omitted locale component is passed through as an empty string; the JVM
    # side then falls back to the default locale.
    language = lit("") if language is None else language
    country = lit("") if country is None else country

    sc = SparkContext._active_spark_context
    jcols = [_to_java_column(c) for c in (string, language, country)]
    return Column(sc._jvm.functions.sentences(*jcols))
|
||||
|
||||
|
||||
def substring(str, pos, len):
|
||||
"""
|
||||
Substring starts at `pos` and is of length `len` when str is String type or
|
||||
|
|
|
@@ -221,6 +221,11 @@ def map_from_entries(col: ColumnOrName) -> Column: ...
|
|||
def array_repeat(col: ColumnOrName, count: Union[Column, int]) -> Column: ...
|
||||
def arrays_zip(*cols: ColumnOrName) -> Column: ...
|
||||
def map_concat(*cols: ColumnOrName) -> Column: ...
|
||||
# Parameter name must match the runtime implementation in
# pyspark.sql.functions.sentences (which names it `string`, not `col`),
# so that keyword calls like sentences(string=...) type-check correctly.
def sentences(
    string: ColumnOrName,
    language: Optional[ColumnOrName] = ...,
    country: Optional[ColumnOrName] = ...
) -> Column: ...
|
||||
def sequence(
|
||||
start: ColumnOrName, stop: ColumnOrName, step: Optional[ColumnOrName] = ...
|
||||
) -> Column: ...
|
||||
|
|
|
@@ -2867,6 +2867,25 @@ object functions {
|
|||
new Overlay(src.expr, replace.expr, pos.expr)
|
||||
}
|
||||
|
||||
/**
 * Splits a string into arrays of sentences, where each sentence is an array of words.
 *
 * @param string the input string column to split
 * @param language the language of the locale
 * @param country the country of the locale
 * @group string_funcs
 * @since 3.2.0
 */
def sentences(string: Column, language: Column, country: Column): Column =
  withExpr(Sentences(string.expr, language.expr, country.expr))
|
||||
|
||||
/**
 * Splits a string into arrays of sentences, where each sentence is an array of words.
 * The default locale is used.
 *
 * @param string the input string column to split
 * @group string_funcs
 * @since 3.2.0
 */
def sentences(string: Column): Column =
  withExpr(Sentences(string.expr))
|
||||
|
||||
/**
|
||||
* Translate any character in the src by a character in replaceString.
|
||||
* The characters in replaceString correspond to the characters in matchingString.
|
||||
|
|
|
@@ -557,11 +557,18 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
|
|||
df.selectExpr("sentences(str, language, country)"),
|
||||
Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))
|
||||
|
||||
checkAnswer(
|
||||
df.select(sentences($"str", $"language", $"country")),
|
||||
Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now"))))
|
||||
|
||||
// Type coercion
|
||||
checkAnswer(
|
||||
df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),
|
||||
Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))
|
||||
|
||||
checkAnswer(df.select(sentences(lit(null)), sentences(lit(10)), sentences(lit(3.14))),
|
||||
Row(null, Seq(Seq("10")), Seq(Seq("3.14"))))
|
||||
|
||||
// Argument number exception
|
||||
val m = intercept[AnalysisException] {
|
||||
df.selectExpr("sentences()")
|
||||
|
|
Loading…
Reference in a new issue