[SPARK-16324][SQL] regexp_extract should doc that it returns empty string when match fails

## What changes were proposed in this pull request?

Document that `regexp_extract` returns an empty string when the regex does not match or the specified group does not match.

## How was this patch tested?

Jenkins tests, plus a few new test cases.

Author: Sean Owen <sowen@cloudera.com>

Closes #14525 from srowen/SPARK-16324.
This commit is contained in:
Sean Owen 2016-08-10 10:14:43 +01:00
parent eca58755fb
commit 0578ff9681
3 changed files with 11 additions and 2 deletions

View file

@ -1440,11 +1440,15 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
"""Extract a specific(idx) group identified by a java regex, from the specified string column.
"""Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
[Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
[Row(d=u'')]

View file

@ -2175,7 +2175,8 @@ object functions {
def ltrim(e: Column): Column = withExpr {StringTrimLeft(e.expr) }
/**
* Extract a specific(idx) group identified by a java regex, from the specified string column.
* Extract a specific group matched by a Java regex, from the specified string column.
* If the regex did not match, or the specified group did not match, an empty string is returned.
*
* @group string_funcs
* @since 1.5.0

View file

@ -96,6 +96,10 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
test("non-matching optional group") {
val df = Seq(Tuple1("aaaac")).toDF("s")
checkAnswer(
df.select(regexp_extract($"s", "(foo)", 1)),
Row("")
)
checkAnswer(
df.select(regexp_extract($"s", "(a+)(b)?(c)", 2)),
Row("")