HIVE-9281 : Code cleanup [Spark Branch] (Szehon, reviewed by Xuefu)
git-svn-id: https://svn.apache.org/repos/asf/hive/branches/spark@1650201 13f79535-47bb-0310-9956-ffa450edef68
Szehon Ho committed Jan 8, 2015
1 parent 10e7dbb commit 153e9ec
Showing 83 changed files with 603 additions and 719 deletions.
@@ -124,7 +124,7 @@ public static void afterTest() throws Exception {
}

/**
* Verify that the connection to HS2 with MiniMr is successful
* Verify that the connection to HS2 with MiniMr is successful.
* @throws Exception
*/
@Test
@@ -134,7 +134,7 @@ public void testConnection() throws Exception {
}

/**
* Run nonMr query
* Run nonMr query.
* @throws Exception
*/
@Test
@@ -147,15 +147,15 @@ public void testNonSparkQuery() throws Exception {
}

/**
* Run nonMr query
* Run nonMr query.
* @throws Exception
*/
@Test
public void testSparkQuery() throws Exception {
String tableName = "testTab2";
String resultVal = "val_238";
String queryStr = "SELECT * FROM " + tableName +
" where value = '" + resultVal + "'";
String queryStr = "SELECT * FROM " + tableName
+ " where value = '" + resultVal + "'";

testKvQuery(tableName, queryStr, resultVal);
}
@@ -233,8 +233,8 @@ public void testTempTable() throws Exception {
+ dataFilePath.toString() + "' into table " + tempTableName);

String resultVal = "val_238";
String queryStr = "SELECT * FROM " + tempTableName +
" where value = '" + resultVal + "'";
String queryStr = "SELECT * FROM " + tempTableName
+ " where value = '" + resultVal + "'";
verifyResult(queryStr, resultVal, 2);

// A second connection should not be able to see the table
@@ -244,8 +244,7 @@ public void testTempTable() throws Exception {
stmt2.execute("USE " + dbName);
boolean gotException = false;
try {
ResultSet res;
res = stmt2.executeQuery(queryStr);
stmt2.executeQuery(queryStr);
} catch (SQLException err) {
// This is expected to fail.
assertTrue("Expecting table not found error, instead got: " + err,
@@ -266,7 +265,7 @@ private void checkForNotExist(ResultSet res) throws Exception {
}

/**
* Verify if the given property contains the expected value
* Verify if the given property contains the expected value.
* @param propertyName
* @param expectedValue
* @throws Exception
@@ -275,7 +274,7 @@ private void verifyProperty(String propertyName, String expectedValue) throws Ex
Statement stmt = hs2Conn .createStatement();
ResultSet res = stmt.executeQuery("set " + propertyName);
assertTrue(res.next());
String results[] = res.getString(1).split("=");
String[] results = res.getString(1).split("=");
assertEquals("Property should be set", results.length, 2);
assertEquals("Property should be set", expectedValue, results[1]);
}
@@ -44,6 +44,7 @@

public class SparkHashTableSinkOperator
extends TerminalOperator<SparkHashTableSinkDesc> implements Serializable {
private static final int MIN_REPLICATION = 10;
private static final long serialVersionUID = 1L;
private final String CLASS_NAME = this.getClass().getName();
private final PerfLogger perfLogger = PerfLogger.getPerfLogger();
@@ -122,7 +123,6 @@ protected void flushToFile(MapJoinPersistableTableContainer tableContainer,
+ "-" + Math.abs(Utilities.randGen.nextInt()));
try {
// This will guarantee file name uniqueness.
// TODO: can we use the task id, which should be unique
if (fs.createNewFile(path)) {
break;
}
@@ -131,10 +131,10 @@ protected void flushToFile(MapJoinPersistableTableContainer tableContainer,
}
// TODO find out numOfPartitions for the big table
int numOfPartitions = replication;
replication = (short)Math.min(10, numOfPartitions);
replication = (short) Math.min(MIN_REPLICATION, numOfPartitions);
}
htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag +
" with group count: " + tableContainer.size() + " into file: " + path);
htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag
+ " with group count: " + tableContainer.size() + " into file: " + path);
// get the hashtable file and path
// get the hashtable file and path
OutputStream os = null;
@@ -153,8 +153,8 @@ protected void flushToFile(MapJoinPersistableTableContainer tableContainer,
}
tableContainer.clear();
FileStatus status = fs.getFileStatus(path);
htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path +
" (" + status.getLen() + " bytes)");
htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path
+ " (" + status.getLen() + " bytes)");
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_FLUSH_HASHTABLE + this.getName());
}

@@ -101,7 +101,7 @@ public void load(
bigInputPath = null;
} else {
Set<String> aliases =
((SparkBucketMapJoinContext)mapJoinCtx).getPosToAliasMap().get(pos);
((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
String alias = aliases.iterator().next();
// Any one small table input path
String smallInputPath =
@@ -110,7 +110,7 @@ public void load(
}
}
String fileName = localWork.getBucketFileName(bigInputPath);
Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte)pos, fileName);
Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path);
mapJoinTables[pos] = mapJoinTableSerdes[pos].load(fs, path);
}
@@ -70,13 +70,15 @@ public void collect(HiveKey key, BytesWritable value) throws IOException {
/** Process the given record. */
protected abstract void processNextRecord(T inputRecord) throws IOException;

/** Is the current state of the record processor done? */
/**
* @return true if current state of the record processor is done.
*/
protected abstract boolean processingDone();

/** Close the record processor */
/** Close the record processor. */
protected abstract void closeRecordProcessor();

/** Implement Iterator interface */
/** Implement Iterator interface. */
public class ResultIterator implements Iterator {
@Override
public boolean hasNext(){
@@ -98,8 +100,7 @@ public boolean hasNext(){
return true;
}
} catch (IOException ex) {
// TODO: better handling of exception.
throw new RuntimeException("Error while processing input.", ex);
throw new IllegalStateException("Error while processing input.", ex);
}
}

@@ -79,7 +79,7 @@ private static RowContainer initRowContainer(Configuration conf) {

container.setSerDe(serDe, oi);
container.setTableDesc(tableDesc);
} catch(Exception ex) {
} catch (Exception ex) {
throw new RuntimeException("Failed to create RowContainer", ex);
}
return container;
@@ -114,7 +114,7 @@ public synchronized void clear() {
}
try {
container.clearRows();
} catch(HiveException ex) {
} catch (HiveException ex) {
throw new RuntimeException("Failed to clear rows in RowContainer", ex);
}
cursor = 0;
@@ -22,6 +22,7 @@
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.io.merge.MergeFileMapper;
import org.apache.hadoop.io.BytesWritable;

import scala.Tuple2;

import java.util.Iterator;
@@ -35,6 +36,7 @@ public HiveMapFunction(byte[] jobConfBuffer, SparkReporter sparkReporter) {
super(jobConfBuffer, sparkReporter);
}

@SuppressWarnings("unchecked")
@Override
public Iterable<Tuple2<HiveKey, BytesWritable>>
call(Iterator<Tuple2<BytesWritable, BytesWritable>> it) throws Exception {
@@ -50,7 +52,6 @@ public HiveMapFunction(byte[] jobConfBuffer, SparkReporter sparkReporter) {
}

HiveMapFunctionResultList result = new HiveMapFunctionResultList(jobConf, it, mapRecordHandler);
//TODO we need to implement a Spark specified Reporter to collect stats, refer to HIVE-7709.
mapRecordHandler.init(jobConf, result, sparkReporter);

return result;
@@ -19,7 +19,6 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.Reporter;
import scala.Tuple2;

import java.io.IOException;
@@ -32,6 +31,7 @@ public class HiveMapFunctionResultList extends
/**
* Instantiate result set Iterable for Map function output.
*
* @param conf Hive configuration.
* @param inputIterator Input record iterator.
* @param handler Initialized {@link SparkMapRecordHandler} instance.
*/
@@ -26,13 +26,13 @@


public abstract class HivePairFlatMapFunction<T, K, V> implements PairFlatMapFunction<T, K, V> {
private static final NumberFormat taskIdFormat = NumberFormat.getInstance();
private static final NumberFormat stageIdFormat = NumberFormat.getInstance();
private static final NumberFormat TASK_ID_FORMAT = NumberFormat.getInstance();
private static final NumberFormat STAGE_ID_FORMAT = NumberFormat.getInstance();
static {
taskIdFormat.setGroupingUsed(false);
taskIdFormat.setMinimumIntegerDigits(6);
stageIdFormat.setGroupingUsed(false);
stageIdFormat.setMinimumIntegerDigits(4);
TASK_ID_FORMAT.setGroupingUsed(false);
TASK_ID_FORMAT.setMinimumIntegerDigits(6);
STAGE_ID_FORMAT.setGroupingUsed(false);
STAGE_ID_FORMAT.setMinimumIntegerDigits(4);
}

protected transient JobConf jobConf;
@@ -60,7 +60,7 @@ private void setupMRLegacyConfigs() {
StringBuilder taskAttemptIdBuilder = new StringBuilder("attempt_");
taskAttemptIdBuilder.append(System.currentTimeMillis())
.append("_")
.append(stageIdFormat.format(TaskContext.get().stageId()))
.append(STAGE_ID_FORMAT.format(TaskContext.get().stageId()))
.append("_");

if (isMap()) {
@@ -71,7 +71,7 @@

// Spark task attempt id is increased by Spark context instead of task, which may introduce
// unstable qtest output, since non Hive features depends on this, we always set it to 0 here.
taskAttemptIdBuilder.append(taskIdFormat.format(TaskContext.get().partitionId()))
taskAttemptIdBuilder.append(TASK_ID_FORMAT.format(TaskContext.get().partitionId()))
.append("_0");

String taskAttemptIdStr = taskAttemptIdBuilder.toString();
@@ -20,6 +20,7 @@

import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.io.BytesWritable;

import scala.Tuple2;

import java.util.Iterator;
@@ -33,6 +34,7 @@ public HiveReduceFunction(byte[] buffer, SparkReporter sparkReporter) {
super(buffer, sparkReporter);
}

@SuppressWarnings("unchecked")
@Override
public Iterable<Tuple2<HiveKey, BytesWritable>>
call(Iterator<Tuple2<HiveKey, Iterable<BytesWritable>>> it) throws Exception {
@@ -32,6 +32,7 @@ public class HiveReduceFunctionResultList extends
/**
* Instantiate result set Iterable for Reduce function output.
*
* @param conf Hive configuration.
* @param inputIterator Input record iterator.
* @param reducer Initialized {@link org.apache.hadoop.hive.ql.exec.mr.ExecReducer} instance.
*/
@@ -34,12 +34,15 @@ public interface HiveSparkClient extends Serializable, Closeable {
* @return SparkJobRef could be used to track spark job progress and metrics.
* @throws Exception
*/
public SparkJobRef execute(DriverContext driverContext, SparkWork sparkWork) throws Exception;
SparkJobRef execute(DriverContext driverContext, SparkWork sparkWork) throws Exception;

public SparkConf getSparkConf();
/**
* @return spark configuration
*/
SparkConf getSparkConf();

/**
* Get the count of executors
* @return the number of executors
*/
public int getExecutorCount() throws Exception;
int getExecutorCount() throws Exception;
}
@@ -33,7 +33,7 @@
import org.apache.spark.SparkException;

public class HiveSparkClientFactory {
protected static transient final Log LOG = LogFactory.getLog(HiveSparkClientFactory.class);
protected static final transient Log LOG = LogFactory.getLog(HiveSparkClientFactory.class);

private static final String SPARK_DEFAULT_CONF_FILE = "spark-defaults.conf";
private static final String SPARK_DEFAULT_MASTER = "local";
@@ -27,10 +27,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.apache.hadoop.mapred.JobConf;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

@@ -56,7 +56,7 @@ public class LocalHiveSparkClient implements HiveSparkClient {
private static final long serialVersionUID = 1L;

private static final String MR_JAR_PROPERTY = "tmpjars";
protected static transient final Log LOG = LogFactory
protected static final transient Log LOG = LogFactory
.getLog(LocalHiveSparkClient.class);

private static final Splitter CSV_SPLITTER = Splitter.on(",").omitEmptyStrings();
@@ -138,7 +138,7 @@ public SparkJobRef execute(DriverContext driverContext, SparkWork sparkWork) thr
* At this point single SparkContext is used by more than one thread, so make this
* method synchronized.
*
* TODO: This method can't remove a jar/resource from SparkContext. Looks like this is an
* This method can't remove a jar/resource from SparkContext. Looks like this is an
* issue we have to live with until multiple SparkContexts are supported in a single JVM.
*/
private synchronized void refreshLocalResources(SparkWork sparkWork, HiveConf conf) {
@@ -64,10 +64,10 @@ public class RemoteHiveSparkClient implements HiveSparkClient {
private static final long serialVersionUID = 1L;

private static final String MR_JAR_PROPERTY = "tmpjars";
protected static transient final Log LOG = LogFactory
protected static final transient Log LOG = LogFactory
.getLog(RemoteHiveSparkClient.class);

private static transient final Splitter CSV_SPLITTER = Splitter.on(",").omitEmptyStrings();
private static final transient Splitter CSV_SPLITTER = Splitter.on(",").omitEmptyStrings();

private transient SparkClient remoteClient;
private transient SparkConf sparkConf;