[SPARK-25221][DEPLOY] Consistent trailing whitespace treatment of conf values

## What changes were proposed in this pull request?

Stop trimming values of properties loaded from a file

## How was this patch tested?

Added unit test demonstrating the issue hit in production.

Closes #22213 from gerashegalov/gera/SPARK-25221.

Authored-by: Gera Shegalov <gera@apache.org>
Signed-off-by: Marcelo Vanzin <vanzin@cloudera.com>
This commit is contained in:
Gera Shegalov 2018-09-11 09:28:32 -07:00 committed by Marcelo Vanzin
parent 77579aa8c3
commit bcb9a8c83f
3 changed files with 103 additions and 3 deletions

View file

@ -19,7 +19,6 @@ package org.apache.spark.util
import java.io._
import java.lang.{Byte => JByte}
import java.lang.InternalError
import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, ThreadInfo}
import java.lang.reflect.InvocationTargetException
import java.math.{MathContext, RoundingMode}
@ -2052,6 +2051,30 @@ private[spark] object Utils extends Logging {
}
}
/**
* Implements the same logic as JDK `java.lang.String#trim` by removing leading and trailing
* non-printable characters less or equal to '\u0020' (SPACE) but preserves natural line
* delimiters according to [[java.util.Properties]] load method. The natural line delimiters are
* removed by JDK during load. Therefore any remaining ones have been specifically provided and
* escaped by the user, and must not be ignored
*
* @param str
* @return the trimmed value of str
*/
private[util] def trimExceptCRLF(str: String): String = {
val nonSpaceOrNaturalLineDelimiter: Char => Boolean = { ch =>
ch > ' ' || ch == '\r' || ch == '\n'
}
val firstPos = str.indexWhere(nonSpaceOrNaturalLineDelimiter)
val lastPos = str.lastIndexWhere(nonSpaceOrNaturalLineDelimiter)
if (firstPos >= 0 && lastPos >= 0) {
str.substring(firstPos, lastPos + 1)
} else {
""
}
}
/** Load properties present in the given file. */
def getPropertiesFromFile(filename: String): Map[String, String] = {
val file = new File(filename)
@ -2062,8 +2085,10 @@ private[spark] object Utils extends Logging {
try {
val properties = new Properties()
properties.load(inReader)
properties.stringPropertyNames().asScala.map(
k => (k, properties.getProperty(k).trim)).toMap
properties.stringPropertyNames().asScala
.map { k => (k, trimExceptCRLF(properties.getProperty(k))) }
.toMap
} catch {
case e: IOException =>
throw new SparkException(s"Failed when loading Spark properties from $filename", e)

View file

@ -1144,6 +1144,53 @@ class SparkSubmitSuite
conf1.get(PY_FILES.key) should be (s"s3a://${pyFile.getAbsolutePath}")
conf1.get("spark.submit.pyFiles") should (startWith("/"))
}
test("handles natural line delimiters in --properties-file and --conf uniformly") {
val delimKey = "spark.my.delimiter."
val LF = "\n"
val CR = "\r"
val lineFeedFromCommandLine = s"${delimKey}lineFeedFromCommandLine" -> LF
val leadingDelimKeyFromFile = s"${delimKey}leadingDelimKeyFromFile" -> s"${LF}blah"
val trailingDelimKeyFromFile = s"${delimKey}trailingDelimKeyFromFile" -> s"blah${CR}"
val infixDelimFromFile = s"${delimKey}infixDelimFromFile" -> s"${CR}blah${LF}"
val nonDelimSpaceFromFile = s"${delimKey}nonDelimSpaceFromFile" -> " blah\f"
val testProps = Seq(leadingDelimKeyFromFile, trailingDelimKeyFromFile, infixDelimFromFile,
nonDelimSpaceFromFile)
val props = new java.util.Properties()
val propsFile = File.createTempFile("test-spark-conf", ".properties",
Utils.createTempDir())
val propsOutputStream = new FileOutputStream(propsFile)
try {
testProps.foreach { case (k, v) => props.put(k, v) }
props.store(propsOutputStream, "test whitespace")
} finally {
propsOutputStream.close()
}
val clArgs = Seq(
"--class", "org.SomeClass",
"--conf", s"${lineFeedFromCommandLine._1}=${lineFeedFromCommandLine._2}",
"--conf", "spark.master=yarn",
"--properties-file", propsFile.getPath,
"thejar.jar")
val appArgs = new SparkSubmitArguments(clArgs)
val (_, _, conf, _) = submit.prepareSubmitEnvironment(appArgs)
Seq(
lineFeedFromCommandLine,
leadingDelimKeyFromFile,
trailingDelimKeyFromFile,
infixDelimFromFile
).foreach { case (k, v) =>
conf.get(k) should be (v)
}
conf.get(nonDelimSpaceFromFile._1) should be ("blah")
}
}
object SparkSubmitSuite extends SparkFunSuite with TimeLimits {

View file

@ -1205,6 +1205,34 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3)
// scalastyle:on nonascii
}
test("trimExceptCRLF standalone") {
val crlfSet = Set("\r", "\n")
val nonPrintableButCRLF = (0 to 32).map(_.toChar.toString).toSet -- crlfSet
// identity for CRLF
crlfSet.foreach { s => Utils.trimExceptCRLF(s) === s }
// empty for other non-printables
nonPrintableButCRLF.foreach { s => assert(Utils.trimExceptCRLF(s) === "") }
// identity for a printable string
assert(Utils.trimExceptCRLF("a") === "a")
// identity for strings with CRLF
crlfSet.foreach { s =>
assert(Utils.trimExceptCRLF(s"${s}a") === s"${s}a")
assert(Utils.trimExceptCRLF(s"a${s}") === s"a${s}")
assert(Utils.trimExceptCRLF(s"b${s}b") === s"b${s}b")
}
// trim nonPrintableButCRLF except when inside a string
nonPrintableButCRLF.foreach { s =>
assert(Utils.trimExceptCRLF(s"${s}a") === "a")
assert(Utils.trimExceptCRLF(s"a${s}") === "a")
assert(Utils.trimExceptCRLF(s"b${s}b") === s"b${s}b")
}
}
}
private class SimpleExtension