ZEPPELIN-783 fix CI failure on Spark download
### What is this PR for?
Improve CI by hard-failing (and cleaning up after) Spark download failures, which are responsible for the recent CI red on `master`.

### What type of PR is it?
Bug Fix | Hot Fix

### Todos
 - [x] clean up after failed Spark download attempts
 - [x] leverage Travis CI [caching](https://docs.travis-ci.com/user/caching) for Spark and PySpark binaries under `.spark-dist` (sketched below)
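
A minimal sketch of the resulting cache-or-download flow (illustrative only: the version numbers are examples, and the real logic lives in `testing/downloadSpark.sh` in the diff below):

```bash
#!/bin/bash
# Sketch of the PR's flow: keep the .tgz in the Travis-cached .spark-dist,
# download on a cache miss with a hard timeout, and unpack outside the cache.
SPARK_VERSION="1.6.0"    # example version from the CI matrix
HADOOP_VERSION="2.3"     # example version from the CI matrix
SPARK_CACHE=".spark-dist"
SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"

mkdir -p "${SPARK_CACHE}"
if [[ ! -f "${SPARK_CACHE}/${SPARK_ARCHIVE}.tgz" ]]; then
  # Cache miss: fetch from the Apache archive, killing the download hard
  # if it exceeds the timeout instead of letting the CI job hang.
  (cd "${SPARK_CACHE}" && timeout -s KILL 590 wget -q \
    "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz")
fi

# Unpack into the repository root so the cache holds only the archive;
# clean up partial results if extraction fails.
cp "${SPARK_CACHE}/${SPARK_ARCHIVE}.tgz" .
if ! tar zxf "${SPARK_ARCHIVE}.tgz"; then
  rm -rf "${SPARK_ARCHIVE}" "${SPARK_ARCHIVE}.tgz"
  exit 1
fi
```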

### What is the Jira issue?
[ZEPPELIN-783](https://issues.apache.org/jira/browse/ZEPPELIN-783)

### How should this be tested?
CI must be green.
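
Besides the Travis run, the caching path can be spot-checked locally (hypothetical invocation; the version arguments are examples matching the CI matrix):

```bash
./testing/downloadSpark.sh 1.6.0 2.3   # first run: downloads into .spark-dist
./testing/downloadSpark.sh 1.6.0 2.3   # second run: should reuse the cached .tgz
ls -la .spark-dist                     # expect spark-1.6.0-bin-hadoop2.3.tgz
```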

### Questions:
* Do the license files need an update? No
* Are there breaking changes for older versions? No
* Does this need documentation? No

Author: Alexander Bezzubov <[email protected]>

Closes apache#810 from bzz/ZEPPELIN-783-fix-ci-spark-download and squashes the following commits:

9d59646 [Alexander Bezzubov] ZEPPELIN-783: consistent download timeout
b6310f0 [Alexander Bezzubov] ZEPPELIN-783: add debug info: download, Zeppelin config
5d0eb2d [Alexander Bezzubov] ZEPPELIN-783: pyspark & spark cache under .spark-dist, but unpack to root
d4ef96d [Alexander Bezzubov] ZEPPELIN-783: exclude .spark-dist cache from RAT
388d76b [Alexander Bezzubov] ZEPPELIN-783: backport from Spark download to start/stop scripts
fa8b516 [Alexander Bezzubov] ZEPPELIN-783: reconcile CI-time and build-time Spark download locations
542a305 [Alexander Bezzubov] ZEPPELIN-783: use TravisCI caching for reliable Spark download
bd1d5e2 [Alexander Bezzubov] ZEPPELIN-783: add cleanup on download failure
b413743 [Alexander Bezzubov] ZEPPELIN-783: refactoring - extract SPARK_ARCHIVE var
346e075 [Alexander Bezzubov] ZEPPELIN-783: upd shell style
bzz committed Apr 2, 2016
1 parent a7a7bdb commit 67e0fd5
Showing 9 changed files with 107 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@
spark/derby.log
spark/metastore_db
spark-1.*-bin-hadoop*
.spark-dist
zeppelin-server/derby.log

lens/lens-cli-hist.log
5 changes: 5 additions & 0 deletions .travis.yml
@@ -16,6 +16,9 @@
language: java

sudo: false
cache:
  directories:
    - .spark-dist

matrix:
include:
@@ -48,6 +51,7 @@ matrix:
env: TEST_SELENIUM="true" SPARK_VER="1.6.0" HADOOP_VER="2.3" PROFILE="-Pspark-1.6 -Phadoop-2.3 -Ppyspark" BUILD_FLAG="package -DskipTests" TEST_FLAG="verify" TEST_PROJECTS="-pl zeppelin-interpreter,zeppelin-zengine,zeppelin-server,zeppelin-display,spark-dependencies,spark -Dtest=org.apache.zeppelin.AbstractFunctionalSuite -DfailIfNoTests=false"

before_install:
- "ls -la .spark-dist"
- "export DISPLAY=:99.0"
- "sh -e /etc/init.d/xvfb start"

@@ -58,6 +62,7 @@ before_script:
- travis_retry ./testing/downloadSpark.sh $SPARK_VER $HADOOP_VER
- ./testing/startSparkCluster.sh $SPARK_VER $HADOOP_VER
- echo "export SPARK_HOME=`pwd`/spark-$SPARK_VER-bin-hadoop$HADOOP_VER" > conf/zeppelin-env.sh
- tail conf/zeppelin-env.sh

script:
- mvn $TEST_FLAG $PROFILE -B $TEST_PROJECTS
11 changes: 11 additions & 0 deletions pom.xml
@@ -241,6 +241,7 @@
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>

<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
@@ -249,6 +250,7 @@
<target>1.7</target>
</configuration>
</plugin>

<!-- Test coverage plugin -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
@@ -270,6 +272,7 @@
</execution>
</executions>
</plugin>

<!-- Checkstyle plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -488,6 +491,7 @@
<exclude>conf/notebook-authorization.json</exclude>
<exclude>conf/zeppelin-env.sh</exclude>
<exclude>spark-*-bin*/**</exclude>
<exclude>.spark-dist/**</exclude>

<!-- bundled from bootstrap -->
<exclude>docs/assets/themes/zeppelin/bootstrap/**</exclude>
@@ -640,6 +644,13 @@
</lifecycleMappingMetadata>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.7</version>
</plugin>

</plugins>
</pluginManagement>
</build>
33 changes: 26 additions & 7 deletions spark-dependencies/pom.xml
@@ -50,7 +50,11 @@
<akka.group>org.spark-project.akka</akka.group>
<akka.version>2.3.4-spark</akka.version>

<spark.download.url>http://archive.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url>
<spark.archive>spark-${spark.version}</spark.archive>
<spark.download.url>
http://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
</spark.download.url>
<spark.dist.cache>${project.build.directory}/../../.spark-dist</spark.dist.cache>
<py4j.version>0.8.2.1</py4j.version>
</properties>

@@ -787,12 +791,12 @@
</goals>
<configuration>
<url>${spark.download.url}</url>
<unpack>true</unpack>
<outputDirectory>${project.build.directory}/spark-dist</outputDirectory>
<outputDirectory>${spark.dist.cache}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>

<plugin>
<artifactId>maven-clean-plugin</artifactId>
<configuration>
@@ -806,13 +810,28 @@
</filesets>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.7</version>
<executions>
<execution>
<id>download-and-zip-pyspark-files</id>
<id>unzip-pyspark-files</id>
<phase>validate</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<untar src="${spark.dist.cache}/${spark.archive}.tgz"
dest="${project.build.directory}/spark-dist"
compression="gzip"/>
</target>
</configuration>
</execution>

<execution>
<id>zip-pyspark-files</id>
<phase>generate-resources</phase>
<goals>
<goal>run</goal>
@@ -821,9 +840,9 @@
<target>
<delete dir="../interpreter/spark/pyspark"/>
<copy todir="../interpreter/spark/pyspark"
file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-${py4j.version}-src.zip"/>
file="${project.build.directory}/spark-dist/${spark.archive}/python/lib/py4j-${py4j.version}-src.zip"/>
<zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip"
basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python"
basedir="${project.build.directory}/spark-dist/${spark.archive}/python"
includes="pyspark/*.py,pyspark/**/*.py"/>
</target>
</configuration>
69 changes: 45 additions & 24 deletions testing/downloadSpark.sh
@@ -17,7 +17,7 @@
#


if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -26,10 +26,10 @@ fi
SPARK_VERSION="${1}"
HADOOP_VERSION="${2}"

echo ${SPARK_VERSION} | grep "^1.[123].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
echo "${SPARK_VERSION}" | grep "^1.[123].[0-9]" > /dev/null
if [[ "$?" -eq 0 ]]; then
echo "${SPARK_VERSION}" | grep "^1.[12].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
if [[ "$?" -eq 0 ]]; then
SPARK_VER_RANGE="<=1.2"
else
SPARK_VER_RANGE="<=1.3"
@@ -40,31 +40,52 @@ fi

set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
TIMEOUT_SEC=590
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_CACHE=".spark-dist"
SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"
if [ ! -d "${SPARK_HOME}" ]; then
if [ "${SPARK_VER_RANGE}" == "<=1.2" ]; then
# spark 1.1.x and spark 1.2.x can be downloaded from archive
STARTTIME=`date +%s`
timeout -s KILL 300 wget -q http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
ENDTIME=`date +%s`
DOWNLOADTIME=$((ENDTIME-STARTTIME))
else
# spark 1.3.x and later can be downloaded from mirror
# get download address from mirror
MIRROR_INFO=$(curl -s "http://www.apache.org/dyn/closer.cgi/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz?asjson=1")

PREFFERED=$(echo "${MIRROR_INFO}" | grep preferred | sed 's/[^"]*.preferred.: .\([^"]*\).*/\1/g')
PATHINFO=$(echo "${MIRROR_INFO}" | grep path_info | sed 's/[^"]*.path_info.: .\([^"]*\).*/\1/g')
if [[ ! -d "${SPARK_HOME}" ]]; then
mkdir -p "${SPARK_CACHE}"
cd "${SPARK_CACHE}"
if [[ ! -f "${SPARK_ARCHIVE}.tgz" ]]; then
pwd
ls -la .
echo "${SPARK_CACHE} does not have ${SPARK_ARCHIVE} downloading ..."
# download archive if not cached
if [[ "${SPARK_VER_RANGE}" == "<=1.2" ]]; then
# spark 1.1.x and spark 1.2.x can be downloaded from archive
STARTTIME=`date +%s`
timeout -s KILL "${TIMEOUT_SEC}" wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
else
# spark 1.3.x and later can be downloaded from mirror
# get download address from mirror
MIRROR_INFO=$(curl -s "http://www.apache.org/dyn/closer.cgi/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?asjson=1")

PREFFERED=$(echo "${MIRROR_INFO}" | grep preferred | sed 's/[^"]*.preferred.: .\([^"]*\).*/\1/g')
PATHINFO=$(echo "${MIRROR_INFO}" | grep path_info | sed 's/[^"]*.path_info.: .\([^"]*\).*/\1/g')

STARTTIME=`date +%s`
timeout -s KILL "${TIMEOUT_SEC}" wget -q "${PREFFERED}${PATHINFO}"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
fi
fi

STARTTIME=`date +%s`
timeout -s KILL 590 wget -q "${PREFFERED}${PATHINFO}"
ENDTIME=`date +%s`
DOWNLOADTIME=$((ENDTIME-STARTTIME))
# extract archive in un-cached root, clean-up on failure
cp "${SPARK_ARCHIVE}.tgz" ..
cd ..
if ! tar zxf "${SPARK_ARCHIVE}.tgz" ; then
echo "Unable to extract ${SPARK_ARCHIVE}.tgz" >&2
rm -rf "${SPARK_ARCHIVE}"
rm -f "${SPARK_ARCHIVE}.tgz"
fi
tar zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
fi

set +xe
19 changes: 10 additions & 9 deletions testing/startSparkCluster.sh
@@ -17,7 +17,7 @@
#


if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -26,10 +26,10 @@ fi
SPARK_VERSION="${1}"
HADOOP_VERSION="${2}"

echo ${SPARK_VERSION} | grep "^1.[123].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
echo "${SPARK_VERSION}" | grep "^1.[123].[0-9]" > /dev/null
if [[ "$?" -eq 0 ]]; then
echo "${SPARK_VERSION}" | grep "^1.[12].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
if [[ "$?" -eq 0 ]]; then
SPARK_VER_RANGE="<=1.2"
else
SPARK_VER_RANGE="<=1.3"
@@ -38,17 +38,18 @@ else
SPARK_VER_RANGE=">1.3"
fi


set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"

# create PID dir. test case detect pid file so they can select active spark home dir for test
mkdir -p ${SPARK_HOME}/run
export SPARK_PID_DIR=${SPARK_HOME}/run
export SPARK_PID_DIR="${SPARK_HOME}/run"
mkdir -p "${SPARK_PID_DIR}"

# start
export SPARK_MASTER_PORT=7071
11 changes: 7 additions & 4 deletions testing/stopSparkCluster.sh
@@ -16,7 +16,7 @@
# limitations under the License.
#

if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -27,12 +27,15 @@ HADOOP_VERSION="${2}"

set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"

# create PID dir
export SPARK_PID_DIR=${SPARK_HOME}/run
export SPARK_PID_DIR="${SPARK_HOME}/run"

${SPARK_HOME}/sbin/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker 1
${SPARK_HOME}/sbin/stop-master.sh
2 changes: 1 addition & 1 deletion zeppelin-server/pom.xml
@@ -369,8 +369,8 @@
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.6</version>
<executions>
<execution>
<id>start-zeppelin</id>
2 changes: 1 addition & 1 deletion zeppelin-web/pom.xml
@@ -47,10 +47,10 @@
<webXml>dist\WEB-INF\web.xml</webXml>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<version>0.11</version>
<configuration>
<excludes>
<exclude>**/.idea/</exclude>
