[SPARK-28066][CORE] Optimize UTF8String.trim() for common case of no whitespace

## What changes were proposed in this pull request?

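UTF8String.trim() allocates a new object even if the string has no leading or trailing spaces, when it could simply return itself. A simple check for this case makes the method about 3x faster in the common case. The same "nothing trimmed" shortcut is applied to trimLeft(), trimRight() and their trimString variants.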

## How was this patch tested?

Existing tests.

A rough benchmark of 90% strings without whitespace (at ends), and 10% that do have whitespace, suggests the average runtime goes from 20 ns to 6 ns.

Closes #24884 from srowen/SPARK-28066.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
Sean Owen 2019-06-17 08:49:11 -07:00 committed by Dongjoon Hyun
parent b7b4452553
commit 4576dfde19

View file

@ -529,26 +529,35 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
return UTF8String.fromBytes(newBytes);
}
/**
 * Strips space bytes (ASCII 32, {@code 0x20}) from both ends of this string.
 *
 * @return {@code EMPTY_UTF8} when every byte is a space (this includes the empty string),
 *         this same instance when neither end carries a space, and otherwise the result of
 *         {@code copyUTF8String(start, end)} for the first and last non-space byte offsets
 */
public UTF8String trim() {
  // Advance past the leading run of spaces.
  int start = 0;
  while (start < this.numBytes && getByte(start) == 0x20) {
    start++;
  }
  if (start == this.numBytes) {
    // Everything trimmed
    return EMPTY_UTF8;
  }

  // Walk backwards over the trailing run of spaces; never crosses `start`,
  // which is known to be a non-space byte at this point.
  int end = this.numBytes - 1;
  while (end > start && getByte(end) == 0x20) {
    end--;
  }

  // Nothing trimmed: hand back this instance instead of building a copy.
  boolean untouched = (start == 0) && (end == numBytes - 1);
  return untouched ? this : copyUTF8String(start, end);
}
/**
* Based on the given trim string, trim this string starting from both ends
* This method searches for each character in the source string, removes the character if it is
* found in the trim string, stops at the first not found. It calls the trimLeft first, then
* trimRight. It returns a new string in which both ends trim characters have been removed.
* Trims instances of the given trim string from both ends of this string.
*
* @param trimString the trim character string
* @return this string with no occurrences of the trim string at the start or end, or `null`
* if `trimString` is `null`
*/
public UTF8String trim(UTF8String trimString) {
if (trimString != null) {
@ -558,24 +567,32 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
}
}
/**
* Trims space characters (ASCII 32) from the start of this string.
*
* @return this string with no spaces at the start
*/
public UTF8String trimLeft() {
int s = 0;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) == 0x20) s++;
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyUTF8String(s, this.numBytes - 1);
if (s == 0) {
// Nothing trimmed
return this;
}
if (s == this.numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
return copyUTF8String(s, this.numBytes - 1);
}
/**
* Based on the given trim string, trim this string starting from left end
* This method searches each character in the source string starting from the left end, removes
* the character if it is in the trim string, stops at the first character which is not in the
* trim string, returns the new string.
* Trims instances of the given trim string from the start of this string.
*
* @param trimString the trim character string
* @return this string with no occurrences of the trim string at the start, or `null`
* if `trimString` is `null`
*/
public UTF8String trimLeft(UTF8String trimString) {
if (trimString == null) return null;
@ -597,34 +614,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
}
srchIdx += searchCharBytes;
}
if (trimIdx >= numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyUTF8String(trimIdx, numBytes - 1);
if (srchIdx == 0) {
// Nothing trimmed
return this;
}
if (trimIdx >= numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
return copyUTF8String(trimIdx, numBytes - 1);
}
/**
* Trims space characters (ASCII 32) from the end of this string.
*
* @return this string with no spaces at the end
*/
public UTF8String trimRight() {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) == 0x20) e--;
if (e < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyUTF8String(0, e);
if (e == numBytes - 1) {
// Nothing trimmed
return this;
}
if (e < 0) {
// Everything trimmed
return EMPTY_UTF8;
}
return copyUTF8String(0, e);
}
/**
* Based on the given trim string, trim this string starting from right end
* This method searches each character in the source string starting from the right end,
* removes the character if it is in the trim string, stops at the first character which is not
* in the trim string, returns the new string.
* Trims instances of the given trim string from the end of this string.
*
* @param trimString the trim character string
* @return this string with no occurrences of the trim string at the end, or `null`
* if `trimString` is `null`
*/
public UTF8String trimRight(UTF8String trimString) {
if (trimString == null) return null;
@ -658,12 +684,15 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
numChars --;
}
if (trimEnd < 0) {
// empty string
return EMPTY_UTF8;
} else {
return copyUTF8String(0, trimEnd);
if (trimEnd == numBytes - 1) {
// Nothing trimmed
return this;
}
if (trimEnd < 0) {
// Everything trimmed
return EMPTY_UTF8;
}
return copyUTF8String(0, trimEnd);
}
public UTF8String reverse() {