[SPARK-28066][CORE] Optimize UTF8String.trim() for common case of no whitespace
## What changes were proposed in this pull request?

`UTF8String.trim()` allocates a new object even if the string has no whitespace, when it could simply return itself. A simple check for this case makes the method about 3x faster in the common case.

## How was this patch tested?

Existing tests. A rough benchmark in which 90% of the strings have no whitespace at either end and 10% do suggests the average runtime goes from 20 ns to 6 ns.

Closes #24884 from srowen/SPARK-28066.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
parent
b7b4452553
commit
4576dfde19
|
@ -529,26 +529,35 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
return UTF8String.fromBytes(newBytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims space characters (ASCII 32) from both ends of this string.
|
||||
*
|
||||
* @return this string with no spaces at the start or end
|
||||
*/
|
||||
public UTF8String trim() {
|
||||
int s = 0;
|
||||
// skip all of the space (0x20) in the left side
|
||||
while (s < this.numBytes && getByte(s) == 0x20) s++;
|
||||
if (s == this.numBytes) {
|
||||
// empty string
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
// skip all of the space (0x20) in the right side
|
||||
int e = this.numBytes - 1;
|
||||
while (e > s && getByte(e) == 0x20) e--;
|
||||
if (s == 0 && e == numBytes - 1) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
return copyUTF8String(s, e);
|
||||
}
|
||||
|
||||
/**
|
||||
* Based on the given trim string, trim this string starting from both ends
|
||||
* This method searches for each character in the source string, removes the character if it is
|
||||
* found in the trim string, stops at the first not found. It calls the trimLeft first, then
|
||||
* trimRight. It returns a new string in which both ends trim characters have been removed.
|
||||
* Trims instances of the given trim string from both ends of this string.
|
||||
*
|
||||
* @param trimString the trim character string
|
||||
* @return this string with no occurrences of the trim string at the start or end, or `null`
|
||||
* if `trimString` is `null`
|
||||
*/
|
||||
public UTF8String trim(UTF8String trimString) {
|
||||
if (trimString != null) {
|
||||
|
@ -558,24 +567,32 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims space characters (ASCII 32) from the start of this string.
|
||||
*
|
||||
* @return this string with no spaces at the start
|
||||
*/
|
||||
public UTF8String trimLeft() {
|
||||
int s = 0;
|
||||
// skip all of the space (0x20) in the left side
|
||||
while (s < this.numBytes && getByte(s) == 0x20) s++;
|
||||
if (s == this.numBytes) {
|
||||
// empty string
|
||||
return EMPTY_UTF8;
|
||||
} else {
|
||||
return copyUTF8String(s, this.numBytes - 1);
|
||||
if (s == 0) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
if (s == this.numBytes) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
return copyUTF8String(s, this.numBytes - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Based on the given trim string, trim this string starting from left end
|
||||
* This method searches each character in the source string starting from the left end, removes
|
||||
* the character if it is in the trim string, stops at the first character which is not in the
|
||||
* trim string, returns the new string.
|
||||
* Trims instances of the given trim string from the start of this string.
|
||||
*
|
||||
* @param trimString the trim character string
|
||||
* @return this string with no occurrences of the trim string at the start, or `null`
|
||||
* if `trimString` is `null`
|
||||
*/
|
||||
public UTF8String trimLeft(UTF8String trimString) {
|
||||
if (trimString == null) return null;
|
||||
|
@ -597,34 +614,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
}
|
||||
srchIdx += searchCharBytes;
|
||||
}
|
||||
|
||||
if (trimIdx >= numBytes) {
|
||||
// empty string
|
||||
return EMPTY_UTF8;
|
||||
} else {
|
||||
return copyUTF8String(trimIdx, numBytes - 1);
|
||||
if (srchIdx == 0) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
if (trimIdx >= numBytes) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
return copyUTF8String(trimIdx, numBytes - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims space characters (ASCII 32) from the end of this string.
|
||||
*
|
||||
* @return this string with no spaces at the end
|
||||
*/
|
||||
public UTF8String trimRight() {
|
||||
int e = numBytes - 1;
|
||||
// skip all of the space (0x20) in the right side
|
||||
while (e >= 0 && getByte(e) == 0x20) e--;
|
||||
|
||||
if (e < 0) {
|
||||
// empty string
|
||||
return EMPTY_UTF8;
|
||||
} else {
|
||||
return copyUTF8String(0, e);
|
||||
if (e == numBytes - 1) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
if (e < 0) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
return copyUTF8String(0, e);
|
||||
}
|
||||
|
||||
/**
|
||||
* Based on the given trim string, trim this string starting from right end
|
||||
* This method searches each character in the source string starting from the right end,
|
||||
* removes the character if it is in the trim string, stops at the first character which is not
|
||||
* in the trim string, returns the new string.
|
||||
* Trims instances of the given trim string from the end of this string.
|
||||
*
|
||||
* @param trimString the trim character string
|
||||
* @return this string with no occurrences of the trim string at the end, or `null`
|
||||
* if `trimString` is `null`
|
||||
*/
|
||||
public UTF8String trimRight(UTF8String trimString) {
|
||||
if (trimString == null) return null;
|
||||
|
@ -658,12 +684,15 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
numChars --;
|
||||
}
|
||||
|
||||
if (trimEnd < 0) {
|
||||
// empty string
|
||||
return EMPTY_UTF8;
|
||||
} else {
|
||||
return copyUTF8String(0, trimEnd);
|
||||
if (trimEnd == numBytes - 1) {
|
||||
// Nothing trimmed
|
||||
return this;
|
||||
}
|
||||
if (trimEnd < 0) {
|
||||
// Everything trimmed
|
||||
return EMPTY_UTF8;
|
||||
}
|
||||
return copyUTF8String(0, trimEnd);
|
||||
}
|
||||
|
||||
public UTF8String reverse() {
|
||||
|
|
Loading…
Reference in a new issue