Fixed a bug where the shuffle serializer was ignored by the new shuffle

block iterators for local blocks. Also added a unit test for the fix.
2013-05-24 14:08:37 -07:00 · 2013-05-24 14:08:37 -07:00 · 6ea085169d
parent dbbedfc535
commit 6ea085169d
2 changed files with 19 additions and 4 deletions
--- a/core/src/main/scala/spark/storage/BlockFetcherIterator.scala
+++ b/core/src/main/scala/spark/storage/BlockFetcherIterator.scala
@ -163,7 +163,7 @@ object BlockFetcherIterator {
      // these all at once because they will just memory-map some files, so they won't consume
      // any memory that might exceed our maxBytesInFlight
      for (id <- localBlockIds) {
-        getLocal(id) match {
+        getLocalFromDisk(id, serializer) match {
          case Some(iter) => {
            // Pass 0 as size since it's not in flight
            results.put(new FetchResult(id, 0, () => iter))
--- a/core/src/test/scala/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/spark/ShuffleSuite.scala
@ -305,11 +305,26 @@ class ShuffleSuite extends FunSuite with ShouldMatchers with LocalSparkContext {
    assert(c.partitioner.get === p)
  }

+  test("shuffle serializer") {
+    // Use a local cluster with 2 worker processes (1 core, 512 MB each) so that the shuffle
+    // has to fetch both local and remote blocks. The master URL format is
+    // local-cluster[numWorkers, coresPerWorker, memoryPerWorkerMB].
+    sc = new SparkContext("local-cluster[2,1,512]", "test")
+    val a = sc.parallelize(1 to 10, 2)
+    val b = a.map { x =>
+      (x, new ShuffleSuite.NonJavaSerializableClass(x * 2))
+    }
+    // If the Kryo serializer is not used correctly, the shuffle would fail because the
+    // default Java serializer cannot handle the non-serializable class.
+    val c = new ShuffledRDD(b, new HashPartitioner(3), classOf[spark.KryoSerializer].getName)
+    assert(c.count === 10)
+  }
 }

 object ShuffleSuite {
+
  def mergeCombineException(x: Int, y: Int): Int = {
    throw new SparkException("Exception for map-side combine.")
    x + y
  }
+
+  // Intentionally does not extend java.io.Serializable, so shuffling instances of it only
+  // succeeds when a non-Java serializer (Kryo in the "shuffle serializer" test) is used.
+  class NonJavaSerializableClass(val value: Int)
 }