[SPARK-24542][SQL] UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files

## What changes were proposed in this pull request?

UDF series UDFXPathXXXX allow users to pass carefully crafted XML to access arbitrary files. Spark does not have built-in access control. When users use the external access control library, users might bypass them and access the file contents.

This PR basically patches the Hive fix to Apache Spark. https://issues.apache.org/jira/browse/HIVE-18879

## How was this patch tested?

A unit test case

Author: Xiao Li <gatorsmile@gmail.com>

Closes #21549 from gatorsmile/xpathSecurity.
This commit is contained in:
Xiao Li 2018-06-18 20:17:04 -07:00 committed by Wenchen Fan
parent 1737d45e08
commit 9a75c18290
3 changed files with 51 additions and 3 deletions

View file

@ -21,6 +21,9 @@ import java.io.IOException;
import java.io.Reader;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
@ -37,9 +40,15 @@ import org.xml.sax.InputSource;
* This is based on Hive's UDFXPathUtil implementation.
*/
public class UDFXPathUtil {
public static final String SAX_FEATURE_PREFIX = "http://xml.org/sax/features/";
public static final String EXTERNAL_GENERAL_ENTITIES_FEATURE = "external-general-entities";
public static final String EXTERNAL_PARAMETER_ENTITIES_FEATURE = "external-parameter-entities";
private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
private DocumentBuilder builder = null;
private XPath xpath = XPathFactory.newInstance().newXPath();
private ReusableStringReader reader = new ReusableStringReader();
private InputSource inputSource = new InputSource(reader);
private XPathExpression expression = null;
private String oldPath = null;
@ -65,14 +74,31 @@ public class UDFXPathUtil {
return null;
}
if (builder == null){
try {
initializeDocumentBuilderFactory();
builder = dbf.newDocumentBuilder();
} catch (ParserConfigurationException e) {
throw new RuntimeException(
"Error instantiating DocumentBuilder, cannot build xml parser", e);
}
}
reader.set(xml);
try {
return expression.evaluate(inputSource, qname);
return expression.evaluate(builder.parse(inputSource), qname);
} catch (XPathExpressionException e) {
throw new RuntimeException("Invalid XML document: " + e.getMessage() + "\n" + xml, e);
} catch (Exception e) {
throw new RuntimeException("Error loading expression '" + oldPath + "'", e);
}
}
private void initializeDocumentBuilderFactory() throws ParserConfigurationException {
dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_GENERAL_ENTITIES_FEATURE, false);
dbf.setFeature(SAX_FEATURE_PREFIX + EXTERNAL_PARAMETER_ENTITIES_FEATURE, false);
}
public Boolean evalBoolean(String xml, String path) throws XPathExpressionException {
return (Boolean) eval(xml, path, XPathConstants.BOOLEAN);
}

View file

@ -77,6 +77,27 @@ class UDFXPathUtilSuite extends SparkFunSuite {
assert(ret == "foo")
}
test("embedFailure") {
import org.apache.commons.io.FileUtils
import java.io.File
val secretValue = String.valueOf(Math.random)
val tempFile = File.createTempFile("verifyembed", ".tmp")
tempFile.deleteOnExit()
val fname = tempFile.getAbsolutePath
FileUtils.writeStringToFile(tempFile, secretValue)
val xml =
s"""<?xml version="1.0" encoding="utf-8"?>
|<!DOCTYPE test [
| <!ENTITY embed SYSTEM "$fname">
|]>
|<foo>&embed;</foo>
""".stripMargin
val evaled = new UDFXPathUtil().evalString(xml, "/foo")
assert(evaled.isEmpty)
}
test("number eval") {
var ret =
util.evalNumber("<a><b>true</b><b>false</b><b>b3</b><c>c1</c><c>-77</c></a>", "a/c[2]")

View file

@ -40,8 +40,9 @@ class XPathExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
// Test error message for invalid XML document
val e1 = intercept[RuntimeException] { testExpr("<a>/a>", "a", null.asInstanceOf[T]) }
assert(e1.getCause.getMessage.contains("Invalid XML document") &&
e1.getCause.getMessage.contains("<a>/a>"))
assert(e1.getCause.getCause.getMessage.contains(
"XML document structures must start and end within the same entity."))
assert(e1.getMessage.contains("<a>/a>"))
// Test error message for invalid xpath
val e2 = intercept[RuntimeException] { testExpr("<a></a>", "!#$", null.asInstanceOf[T]) }