#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
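
# Usage:
#   ./dev/test-dependencies.sh                     Regenerate dependency manifests and
#                                                  diff them against the checked-in
#                                                  manifests under dev/deps.
#   ./dev/test-dependencies.sh --replace-manifest  Regenerate the manifests and replace
#                                                  the checked-in ones under dev/deps.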
set -e
FWDIR="$(cd "$(dirname "$0")"/..; pwd)"
cd "$FWDIR"
# Explicitly set locale in order to make `sort` output consistent across machines.
# See https://stackoverflow.com/questions/28881 for more details.
export LC_ALL=C
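# For example (illustrative): under en_US.UTF-8, `sort` can place "a" before "B",
# while under LC_ALL=C it sorts strictly by byte value, so the generated manifests
# come out byte-identical everywhere.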
# TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution.
# NOTE: These should match those in the release publishing script
HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkubernetes -Pyarn -Phive"
MVN="build/mvn"
HADOOP_HIVE_PROFILES=(
  hadoop-2.7-hive-1.2
  hadoop-2.7-hive-2.3
  hadoop-3.2-hive-2.3
)
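# Each profile combination above yields one manifest file, written to
# dev/pr-deps/spark-deps-<profile> and compared against dev/deps/spark-deps-<profile>.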
# We'll switch the version to a temporary one, publish POMs using that new version, then switch back to
# the old version. We need to do this because the `dependency:build-classpath` task needs to
# resolve Spark's internal submodule dependencies.
# From http://stackoverflow.com/a/26514030
set +e
OLD_VERSION=$($MVN -q \
  -Dexec.executable="echo" \
  -Dexec.args='${project.version}' \
  --non-recursive \
  org.codehaus.mojo:exec-maven-plugin:1.6.0:exec)
if [ $? != 0 ]; then
  echo -e "Error while getting version string from Maven:\n$OLD_VERSION"
  exit 1
fi
set -e
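# At this point OLD_VERSION holds the project version read from the root POM,
# e.g. "3.0.0-SNAPSHOT" (illustrative value).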
TEMP_VERSION="spark-$(python -S -c "import random; print(random.randrange(100000, 999999))")"
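# TEMP_VERSION looks like e.g. "spark-123456": the random suffix makes it unlikely
# to collide with anything already installed in the local Maven repo.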
function reset_version {
  # Delete the temporary POMs that we wrote to the local Maven repo:
  find "$HOME/.m2/" | grep "$TEMP_VERSION" | xargs rm -rf
  # Restore the original version number:
  $MVN -q versions:set -DnewVersion=$OLD_VERSION -DgenerateBackupPoms=false > /dev/null
}
trap reset_version EXIT
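# The EXIT trap runs reset_version even when a later Maven step fails under `set -e`,
# so the working tree is never left on the temporary version.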
$MVN -q versions:set -DnewVersion=$TEMP_VERSION -DgenerateBackupPoms=false > /dev/null
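# The install:install goal below publishes each module's POM and jar under the
# temporary version to the local Maven repo, which is what lets
# dependency:build-classpath resolve Spark's inter-module dependencies.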
# Generate manifests for each Hadoop profile:
for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do
  if [[ $HADOOP_HIVE_PROFILE == *hadoop-3.2-hive-2.3* ]]; then
    HADOOP_PROFILE=hadoop-3.2
    HIVE_PROFILE=hive-2.3
  elif [[ $HADOOP_HIVE_PROFILE == *hadoop-2.7-hive-2.3* ]]; then
    HADOOP_PROFILE=hadoop-2.7
    HIVE_PROFILE=hive-2.3
  else
    HADOOP_PROFILE=hadoop-2.7
    HIVE_PROFILE=hive-1.2
  fi
echo "Performing Maven install for $HADOOP_HIVE_PROFILE"
  $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE jar:jar jar:test-jar install:install clean -q
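  # `validate` is the first phase of the Maven lifecycle: it checks that the project
  # is correct and all necessary information is available, without compiling anything.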
echo "Performing Maven validate for $HADOOP_HIVE_PROFILE"
  $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE validate -q
echo "Generating dependency manifest for $HADOOP_HIVE_PROFILE"
mkdir -p dev/pr-deps
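  # dependency:build-classpath prints a "Dependencies classpath:" line followed by a
  # single colon-separated classpath line; the pipeline below isolates that line,
  # splits it into one absolute jar path per line, and rewrites each path into a
  # manifest entry such as (illustrative):
  #   orc-core/1.5.5/nohive/orc-core-1.5.5-nohive.jar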
  $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE dependency:build-classpath -pl assembly -am \
    | grep "Dependencies classpath:" -A 1 \
    | tail -n 1 | tr ":" "\n" | awk -F '/' '{
      # For each dependency classpath, we fetch the last three parts split by "/": artifact id, version, and jar name.
      # Since the classifier, if it exists, always sits between "artifact_id-version-" and the ".jar" suffix in the jar name,
      # we extract the classifier and put it right before the jar name explicitly.
      # For example, `orc-core/1.5.5/nohive/orc-core-1.5.5-nohive.jar`
      #                              ^^^^^^
      #                              extracted classifier
      #              `okio/1.15.0//okio-1.15.0.jar`
      #                          ^
      #                          empty for dependencies without classifier
      artifact_id=$(NF-2);
      version=$(NF-1);
      jar_name=$NF;
      classifier_start_index=length(artifact_id"-"version"-") + 1;
      classifier_end_index=index(jar_name, ".jar") - 1;
      classifier=substr(jar_name, classifier_start_index, classifier_end_index - classifier_start_index + 1);
      print artifact_id"/"version"/"classifier"/"jar_name
    }' | sort | grep -v spark > dev/pr-deps/spark-deps-$HADOOP_HIVE_PROFILE
done
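
# With --replace-manifest, promote the freshly generated manifests to dev/deps
# and stop; otherwise fall through to the diff check below.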
if [[ "$*" == *replace-manifest* ]]; then
  echo "Replacing manifests and creating new files at dev/deps"
  rm -rf dev/deps
  mv dev/pr-deps dev/deps
  exit 0
fi
for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do
  # `git diff --no-index` compares the two files directly (outside the index) and
  # exits non-zero when they differ, so disable `set -e` around it.
  set +e
  dep_diff="$(
    git diff \
      --no-index \
      dev/deps/spark-deps-$HADOOP_HIVE_PROFILE \
      dev/pr-deps/spark-deps-$HADOOP_HIVE_PROFILE \
  )"
  set -e
if [ "$dep_diff" != "" ]; then
echo "Spark's published dependencies DO NOT MATCH the manifest file (dev/spark-deps)."
echo "To update the manifest file, run './dev/test-dependencies.sh --replace-manifest'."
echo "$dep_diff"
rm -rf dev/pr-deps
exit 1
fi
done
exit 0