[SPARK-22130][CORE] UTF8String.trim() scans " " twice

## What changes were proposed in this pull request?

This PR lets `UTF8String.trim()` scan a string consisting only of spaces (e.g. `"     "`) once. The previous implementation scanned such a string twice: left to right, and then right to left.

## How was this patch tested?

Existing test suites, plus three assertions added to `UTF8StringSuite.trims()` in this commit.

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #19355 from kiszk/SPARK-22130.
This commit is contained in:
Kazuaki Ishizaki 2017-09-27 23:19:10 +09:00 committed by hyukjinkwon
parent d2b8b63b93
commit 12e740bba1
2 changed files with 8 additions and 6 deletions

View file

@ -498,17 +498,16 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trim() {
int s = 0;
int e = this.numBytes - 1;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) == 0x20) s++;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) == 0x20) e--;
if (s > e) {
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
} else {
return copyUTF8String(s, e);
}
// skip all of the space (0x20) in the right side
int e = this.numBytes - 1;
while (e > s && getByte(e) == 0x20) e--;
return copyUTF8String(s, e);
}
/**

View file

@ -222,10 +222,13 @@ public class UTF8StringSuite {
@Test
public void trims() {
assertEquals(fromString("1"), fromString("1").trim());
assertEquals(fromString("hello"), fromString(" hello ").trim());
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
assertEquals(EMPTY_UTF8, EMPTY_UTF8.trim());
assertEquals(EMPTY_UTF8, fromString(" ").trim());
assertEquals(EMPTY_UTF8, fromString(" ").trimLeft());
assertEquals(EMPTY_UTF8, fromString(" ").trimRight());