[SPARK-22130][CORE] UTF8String.trim() scans " " twice
## What changes were proposed in this pull request?

This PR makes `UTF8String.trim()` scan a string that consists only of whitespace (e.g. `" "`) once, whereas the current implementation scans it twice (left to right, and then right to left).

## How was this patch tested?

Existing test suites.

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #19355 from kiszk/SPARK-22130.
This commit is contained in:
parent
d2b8b63b93
commit
12e740bba1
|
@ -498,17 +498,16 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
|
|||
|
||||
public UTF8String trim() {
|
||||
int s = 0;
|
||||
int e = this.numBytes - 1;
|
||||
// skip all of the space (0x20) in the left side
|
||||
while (s < this.numBytes && getByte(s) == 0x20) s++;
|
||||
// skip all of the space (0x20) in the right side
|
||||
while (e >= 0 && getByte(e) == 0x20) e--;
|
||||
if (s > e) {
|
||||
if (s == this.numBytes) {
|
||||
// empty string
|
||||
return EMPTY_UTF8;
|
||||
} else {
|
||||
return copyUTF8String(s, e);
|
||||
}
|
||||
// skip all of the space (0x20) in the right side
|
||||
int e = this.numBytes - 1;
|
||||
while (e > s && getByte(e) == 0x20) e--;
|
||||
return copyUTF8String(s, e);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -222,10 +222,13 @@ public class UTF8StringSuite {
|
|||
|
||||
@Test
|
||||
public void trims() {
|
||||
assertEquals(fromString("1"), fromString("1").trim());
|
||||
|
||||
assertEquals(fromString("hello"), fromString(" hello ").trim());
|
||||
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
|
||||
assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
|
||||
|
||||
assertEquals(EMPTY_UTF8, EMPTY_UTF8.trim());
|
||||
assertEquals(EMPTY_UTF8, fromString(" ").trim());
|
||||
assertEquals(EMPTY_UTF8, fromString(" ").trimLeft());
|
||||
assertEquals(EMPTY_UTF8, fromString(" ").trimRight());
|
||||
|
|
Loading…
Reference in a new issue