spark-instrumented-optimizer/dev/run-tests

#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Go to the Spark project root directory
FWDIR="$(cd "`dirname $0`"/..; pwd)"
cd "$FWDIR"

# Remove work directory
rm -rf ./work

source "$FWDIR/dev/run-tests-codes.sh"

CURRENT_BLOCK=$BLOCK_GENERAL

function handle_error () {
  echo "[error] Got a return code of $? on line $1 of the run-tests script."
  exit $CURRENT_BLOCK
}


# Build against the right verison of Hadoop.
{
  if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
    if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
      export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4"
    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
      export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1"
    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then
      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0"
    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then
      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
    fi
  fi

  if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then
    export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
  fi
}

export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"

# Determine Java path and version.
{
  if test -x "$JAVA_HOME/bin/java"; then
      declare java_cmd="$JAVA_HOME/bin/java"
  else
      declare java_cmd=java
  fi

  # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses.
  JAVA_VERSION=$(
    $java_cmd -version 2>&1 \
    | grep -e "^java version" --max-count=1 \
    | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/"
  )

  if [ "$JAVA_VERSION" -lt 18 ]; then
    echo "[warn] Java 8 tests will not run because JDK version is < 1.8."
  fi
}

# Only run Hive tests if there are sql changes.
# Partial solution for SPARK-1455.
if [ -n "$AMPLAB_JENKINS" ]; then
  git fetch origin master:master

  sql_diffs=$(
    git diff --name-only master \
    | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
  )

  non_sql_diffs=$(
    git diff --name-only master \
    | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
  )

  if [ -n "$sql_diffs" ]; then
    echo "[info] Detected changes in SQL. Will run Hive test suite."
    _RUN_SQL_TESTS=true

    if [ -z "$non_sql_diffs" ]; then
      echo "[info] Detected no changes except in SQL. Will only run SQL tests."
      _SQL_TESTS_ONLY=true
    fi
  fi
fi

set -o pipefail
trap 'handle_error $LINENO' ERR

echo ""
echo "========================================================================="
echo "Running Apache RAT checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_RAT

./dev/check-license

echo ""
echo "========================================================================="
echo "Running Scala style checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_SCALA_STYLE

./dev/lint-scala

echo ""
echo "========================================================================="
echo "Running Python style checks"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_PYTHON_STYLE

./dev/lint-python

echo ""
echo "========================================================================="
echo "Building Spark"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_BUILD

{
  # We always build with Hive because the PySpark Spark SQL tests need it.
  BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-0.12.0"


  # NOTE: echo "q" is needed because sbt on encountering a build file with failure
  #+ (either resolution or compilation) prompts the user for input either q, r, etc
  #+ to quit or retry. This echo is there to make it not block.
  # NOTE: Do not quote $BUILD_MVN_PROFILE_ARGS or else it will be interpreted as a
  #+ single argument!
  # QUESTION: Why doesn't 'yes "q"' work?
  # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work?
  # First build with 0.12 to ensure patches do not break the hive 12 build
  echo "[info] Compile with hive 0.12"
  echo -e "q\n" \
    | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean hive/compile hive-thriftserver/compile \
    | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"

  # Then build with default version(0.13.1) because tests are based on this version
  echo "[info] Building Spark with these arguments: $SBT_MAVEN_PROFILES_ARGS -Phive"
  echo -e "q\n" \
    | sbt/sbt $SBT_MAVEN_PROFILES_ARGS -Phive package assembly/assembly  \
    | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
}

echo ""
echo "========================================================================="
echo "Running Spark unit tests"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS

{
  # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled.
  # This must be a single argument, as it is.
  if [ -n "$_RUN_SQL_TESTS" ]; then
    SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive"
  fi

  if [ -n "$_SQL_TESTS_ONLY" ]; then
    # This must be an array of individual arguments. Otherwise, having one long string
    #+ will be interpreted as a single test, which doesn't work.
    SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test")
  else
    SBT_MAVEN_TEST_ARGS=("test")
  fi

  echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}"

  # NOTE: echo "q" is needed because sbt on encountering a build file with failure
  #+ (either resolution or compilation) prompts the user for input either q, r, etc
  #+ to quit or retry. This echo is there to make it not block.
  # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a
  #+ single argument!
  #+ "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array.
  # QUESTION: Why doesn't 'yes "q"' work?
  # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work?
  echo -e "q\n" \
    | sbt/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \
    | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
}

echo ""
echo "========================================================================="
echo "Running PySpark tests"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS

./python/run-tests

echo ""
echo "========================================================================="
echo "Detecting binary incompatibilites with MiMa"
echo "========================================================================="

CURRENT_BLOCK=$BLOCK_MIMA

./dev/mima