ZEPPELIN-783 fix CI failure on Spark download
### What is this PR for?
Improve CI by hard-failing (and cleaning up after) Spark download failures, which are responsible for the recent CI red on `master`.

### What type of PR is it?
Bug Fix | Hot Fix

### Todos
 - [x] clean up after failed Spark download attempts
 - [x] leverage Travis CI [caching](https://docs.travis-ci.com/user/caching) for Spark and PySpark binaries under `.spark-dist` (sketched below)
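
A minimal sketch of the resulting cache-or-download flow (illustrative only: the version numbers are examples, and the real logic lives in `testing/downloadSpark.sh` in the diff below):

```bash
#!/bin/bash
# Sketch of the PR's flow: keep the .tgz in the Travis-cached .spark-dist,
# download on a cache miss with a hard timeout, and unpack outside the cache.
SPARK_VERSION="1.6.0"    # example version from the CI matrix
HADOOP_VERSION="2.3"     # example version from the CI matrix
SPARK_CACHE=".spark-dist"
SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"

mkdir -p "${SPARK_CACHE}"
if [[ ! -f "${SPARK_CACHE}/${SPARK_ARCHIVE}.tgz" ]]; then
  # Cache miss: fetch from the Apache archive, killing the download hard
  # if it exceeds the timeout instead of letting the CI job hang.
  (cd "${SPARK_CACHE}" && timeout -s KILL 590 wget -q \
    "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz")
fi

# Unpack into the repository root so the cache holds only the archive;
# clean up partial results if extraction fails.
cp "${SPARK_CACHE}/${SPARK_ARCHIVE}.tgz" .
if ! tar zxf "${SPARK_ARCHIVE}.tgz"; then
  rm -rf "${SPARK_ARCHIVE}" "${SPARK_ARCHIVE}.tgz"
  exit 1
fi
```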

### What is the Jira issue?
[ZEPPELIN-783](https://issues.apache.org/jira/browse/ZEPPELIN-783)

### How should this be tested?
CI must be green.
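
Besides the Travis run, the caching path can be spot-checked locally (hypothetical invocation; the version arguments are examples matching the CI matrix):

```bash
./testing/downloadSpark.sh 1.6.0 2.3   # first run: downloads into .spark-dist
./testing/downloadSpark.sh 1.6.0 2.3   # second run: should reuse the cached .tgz
ls -la .spark-dist                     # expect spark-1.6.0-bin-hadoop2.3.tgz
```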

### Questions:
* Do the license files need an update? No
* Are there breaking changes for older versions? No
* Does this need documentation? No

Author: Alexander Bezzubov <[email protected]>

Closes apache#810 from bzz/ZEPPELIN-783-fix-ci-spark-download and squashes the following commits:

9d59646 [Alexander Bezzubov] ZEPPELIN-783: consistent download timeout
b6310f0 [Alexander Bezzubov] ZEPPELIN-783: add debug info: download, Zeppelin config
5d0eb2d [Alexander Bezzubov] ZEPPELIN-783: pyspark & spark cache under .spark-dist, but unpack to root
d4ef96d [Alexander Bezzubov] ZEPPELIN-783: exclude .spark-dist cache from RAT
388d76b [Alexander Bezzubov] ZEPPELIN-783: backport from Spark download to start/stop scripts
fa8b516 [Alexander Bezzubov] ZEPPELIN-783: reconcile CI-time and build-time Spark download locations
542a305 [Alexander Bezzubov] ZEPPELIN-783: use TravisCI caching for reliable Spark download
bd1d5e2 [Alexander Bezzubov] ZEPPELIN-783: add cleanup on download failure
b413743 [Alexander Bezzubov] ZEPPELIN-783: refactoring - extract SPARK_ARCHIVE var
346e075 [Alexander Bezzubov] ZEPPELIN-783: upd shell style
bzz committed Apr 2, 2016
1 parent a7a7bdb commit 67e0fd5
Showing 9 changed files with 107 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@
spark/derby.log
spark/metastore_db
spark-1.*-bin-hadoop*
.spark-dist
zeppelin-server/derby.log

lens/lens-cli-hist.log
5 changes: 5 additions & 0 deletions .travis.yml
@@ -16,6 +16,9 @@
language: java

sudo: false
cache:
  directories:
    - .spark-dist

matrix:
include:
@@ -48,6 +51,7 @@ matrix:
env: TEST_SELENIUM="true" SPARK_VER="1.6.0" HADOOP_VER="2.3" PROFILE="-Pspark-1.6 -Phadoop-2.3 -Ppyspark" BUILD_FLAG="package -DskipTests" TEST_FLAG="verify" TEST_PROJECTS="-pl zeppelin-interpreter,zeppelin-zengine,zeppelin-server,zeppelin-display,spark-dependencies,spark -Dtest=org.apache.zeppelin.AbstractFunctionalSuite -DfailIfNoTests=false"

before_install:
- "ls -la .spark-dist"
- "export DISPLAY=:99.0"
- "sh -e /etc/init.d/xvfb start"

@@ -58,6 +62,7 @@ before_script:
- travis_retry ./testing/downloadSpark.sh $SPARK_VER $HADOOP_VER
- ./testing/startSparkCluster.sh $SPARK_VER $HADOOP_VER
- echo "export SPARK_HOME=`pwd`/spark-$SPARK_VER-bin-hadoop$HADOOP_VER" > conf/zeppelin-env.sh
- tail conf/zeppelin-env.sh

script:
- mvn $TEST_FLAG $PROFILE -B $TEST_PROJECTS
11 changes: 11 additions & 0 deletions pom.xml
@@ -241,6 +241,7 @@
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>

<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
@@ -249,6 +250,7 @@
<target>1.7</target>
</configuration>
</plugin>

<!-- Test coverage plugin -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
@@ -270,6 +272,7 @@
</execution>
</executions>
</plugin>

<!-- Checkstyle plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -488,6 +491,7 @@
<exclude>conf/notebook-authorization.json</exclude>
<exclude>conf/zeppelin-env.sh</exclude>
<exclude>spark-*-bin*/**</exclude>
<exclude>.spark-dist/**</exclude>

<!-- bundled from bootstrap -->
<exclude>docs/assets/themes/zeppelin/bootstrap/**</exclude>
@@ -640,6 +644,13 @@
</lifecycleMappingMetadata>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.7</version>
</plugin>

</plugins>
</pluginManagement>
</build>
33 changes: 26 additions & 7 deletions spark-dependencies/pom.xml
@@ -50,7 +50,11 @@
<akka.group>org.spark-project.akka</akka.group>
<akka.version>2.3.4-spark</akka.version>

<spark.download.url>http://archive.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url>
<spark.archive>spark-${spark.version}</spark.archive>
<spark.download.url>
http://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
</spark.download.url>
<spark.dist.cache>${project.build.directory}/../../.spark-dist</spark.dist.cache>
<py4j.version>0.8.2.1</py4j.version>
</properties>

@@ -787,12 +791,12 @@
</goals>
<configuration>
<url>${spark.download.url}</url>
<unpack>true</unpack>
<outputDirectory>${project.build.directory}/spark-dist</outputDirectory>
<outputDirectory>${spark.dist.cache}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>

<plugin>
<artifactId>maven-clean-plugin</artifactId>
<configuration>
@@ -806,13 +810,28 @@
</filesets>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.7</version>
<executions>
<execution>
<id>download-and-zip-pyspark-files</id>
<id>unzip-pyspark-files</id>
<phase>validate</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<untar src="${spark.dist.cache}/${spark.archive}.tgz"
dest="${project.build.directory}/spark-dist"
compression="gzip"/>
</target>
</configuration>
</execution>

<execution>
<id>zip-pyspark-files</id>
<phase>generate-resources</phase>
<goals>
<goal>run</goal>
@@ -821,9 +840,9 @@
<target>
<delete dir="../interpreter/spark/pyspark"/>
<copy todir="../interpreter/spark/pyspark"
file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-${py4j.version}-src.zip"/>
file="${project.build.directory}/spark-dist/${spark.archive}/python/lib/py4j-${py4j.version}-src.zip"/>
<zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip"
basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python"
basedir="${project.build.directory}/spark-dist/${spark.archive}/python"
includes="pyspark/*.py,pyspark/**/*.py"/>
</target>
</configuration>
69 changes: 45 additions & 24 deletions testing/downloadSpark.sh
@@ -17,7 +17,7 @@
#


if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -26,10 +26,10 @@ fi
SPARK_VERSION="${1}"
HADOOP_VERSION="${2}"

echo ${SPARK_VERSION} | grep "^1.[123].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
echo "${SPARK_VERSION}" | grep "^1.[123].[0-9]" > /dev/null
if [[ "$?" -eq 0 ]]; then
echo "${SPARK_VERSION}" | grep "^1.[12].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
if [[ "$?" -eq 0 ]]; then
SPARK_VER_RANGE="<=1.2"
else
SPARK_VER_RANGE="<=1.3"
@@ -40,31 +40,52 @@ fi

set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
TIMEOUT_SEC=590
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_CACHE=".spark-dist"
SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"
if [ ! -d "${SPARK_HOME}" ]; then
if [ "${SPARK_VER_RANGE}" == "<=1.2" ]; then
# spark 1.1.x and spark 1.2.x can be downloaded from archive
STARTTIME=`date +%s`
timeout -s KILL 300 wget -q http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
ENDTIME=`date +%s`
DOWNLOADTIME=$((ENDTIME-STARTTIME))
else
# spark 1.3.x and later can be downloaded from mirror
# get download address from mirror
MIRROR_INFO=$(curl -s "http://www.apache.org/dyn/closer.cgi/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz?asjson=1")

PREFFERED=$(echo "${MIRROR_INFO}" | grep preferred | sed 's/[^"]*.preferred.: .\([^"]*\).*/\1/g')
PATHINFO=$(echo "${MIRROR_INFO}" | grep path_info | sed 's/[^"]*.path_info.: .\([^"]*\).*/\1/g')
if [[ ! -d "${SPARK_HOME}" ]]; then
mkdir -p "${SPARK_CACHE}"
cd "${SPARK_CACHE}"
if [[ ! -f "${SPARK_ARCHIVE}.tgz" ]]; then
pwd
ls -la .
echo "${SPARK_CACHE} does not have ${SPARK_ARCHIVE} downloading ..."
# download archive if not cached
if [[ "${SPARK_VER_RANGE}" == "<=1.2" ]]; then
# spark 1.1.x and spark 1.2.x can be downloaded from archive
STARTTIME=`date +%s`
timeout -s KILL "${TIMEOUT_SEC}" wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
else
# spark 1.3.x and later can be downloaded from mirror
# get download address from mirror
MIRROR_INFO=$(curl -s "http://www.apache.org/dyn/closer.cgi/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?asjson=1")

PREFFERED=$(echo "${MIRROR_INFO}" | grep preferred | sed 's/[^"]*.preferred.: .\([^"]*\).*/\1/g')
PATHINFO=$(echo "${MIRROR_INFO}" | grep path_info | sed 's/[^"]*.path_info.: .\([^"]*\).*/\1/g')

STARTTIME=`date +%s`
timeout -s KILL "${TIMEOUT_SEC}" wget -q "${PREFFERED}${PATHINFO}"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
fi
fi

STARTTIME=`date +%s`
timeout -s KILL 590 wget -q "${PREFFERED}${PATHINFO}"
ENDTIME=`date +%s`
DOWNLOADTIME=$((ENDTIME-STARTTIME))
# extract archive in un-cached root, clean-up on failure
cp "${SPARK_ARCHIVE}.tgz" ..
cd ..
if ! tar zxf "${SPARK_ARCHIVE}.tgz" ; then
echo "Unable to extract ${SPARK_ARCHIVE}.tgz" >&2
rm -rf "${SPARK_ARCHIVE}"
rm -f "${SPARK_ARCHIVE}.tgz"
fi
tar zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
fi

set +xe
19 changes: 10 additions & 9 deletions testing/startSparkCluster.sh
@@ -17,7 +17,7 @@
#


if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -26,10 +26,10 @@ fi
SPARK_VERSION="${1}"
HADOOP_VERSION="${2}"

echo ${SPARK_VERSION} | grep "^1.[123].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
echo "${SPARK_VERSION}" | grep "^1.[123].[0-9]" > /dev/null
if [[ "$?" -eq 0 ]]; then
echo "${SPARK_VERSION}" | grep "^1.[12].[0-9]" > /dev/null
if [ $? -eq 0 ]; then
if [[ "$?" -eq 0 ]]; then
SPARK_VER_RANGE="<=1.2"
else
SPARK_VER_RANGE="<=1.3"
@@ -38,17 +38,18 @@ else
SPARK_VER_RANGE=">1.3"
fi


set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"

# create PID dir. test case detect pid file so they can select active spark home dir for test
mkdir -p ${SPARK_HOME}/run
export SPARK_PID_DIR=${SPARK_HOME}/run
export SPARK_PID_DIR="${SPARK_HOME}/run"
mkdir -p "${SPARK_PID_DIR}"

# start
export SPARK_MASTER_PORT=7071
11 changes: 7 additions & 4 deletions testing/stopSparkCluster.sh
@@ -16,7 +16,7 @@
# limitations under the License.
#

if [ $# -ne 2 ]; then
if [[ "$#" -ne 2 ]]; then
echo "usage) $0 [spark version] [hadoop version]"
echo " eg) $0 1.3.1 2.6"
exit 1
@@ -27,12 +27,15 @@ HADOOP_VERSION="${2}"

set -xe

FWDIR=$(dirname "${BASH_SOURCE-$0}")
FWDIR="$(dirname "${BASH_SOURCE-$0}")"
ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
export SPARK_HOME=${ZEPPELIN_HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

SPARK_ARCHIVE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
export SPARK_HOME="${ZEPPELIN_HOME}/${SPARK_ARCHIVE}"
echo "SPARK_HOME is ${SPARK_HOME}"

# create PID dir
export SPARK_PID_DIR=${SPARK_HOME}/run
export SPARK_PID_DIR="${SPARK_HOME}/run"

${SPARK_HOME}/sbin/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker 1
${SPARK_HOME}/sbin/stop-master.sh
2 changes: 1 addition & 1 deletion zeppelin-server/pom.xml
@@ -369,8 +369,8 @@
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.6</version>
<executions>
<execution>
<id>start-zeppelin</id>
2 changes: 1 addition & 1 deletion zeppelin-web/pom.xml
@@ -47,10 +47,10 @@
<webXml>dist\WEB-INF\web.xml</webXml>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<version>0.11</version>
<configuration>
<excludes>
<exclude>**/.idea/</exclude>
