Split vTableReplicated into two RDDs

Previously, (vTableReplicated: IndexedRDD[Pid, VertexHashMap[VD]]) stored one hashmap per partition, taking Vid directly to VD. To take advantage of rxin's new hashmaps (see rxin/incubator-spark@32a79d6d13), this commit splits that data structure into two RDDs: (vTableReplicationMap: IndexedRDD[Pid, VertexIdToIndexMap]) stores a map per partition from vertex ID to the index where that vertex's attribute is stored. This index refers to an array in the same partition in vTableReplicatedValues. (vTableReplicatedValues: IndexedRDD[Pid, Array[VD]]) stores the vertex data and is arranged as described above.
2013-10-16 18:59:54 -07:00 · 2013-10-16 18:59:54 -07:00 · bc234bf0e1
parent af8e461841
commit bc234bf0e1
2 changed files with 68 additions and 48 deletions
--- a/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
@ -24,7 +24,8 @@ import org.apache.spark.graph.impl.MessageToPartitionRDDFunctions._
 * The Iterator type returned when constructing edge triplets
 */
 class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
-  val vmap: VertexHashMap[VD],
+  val vidToIndex: VertexIdToIndexMap,
+  val vertexArray: Array[VD],
  val edgePartition: EdgePartition[ED]) extends Iterator[EdgeTriplet[VD, ED]] {

  private var pos = 0
@ -34,10 +35,10 @@ class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
  override def next() = {
    et.srcId = edgePartition.srcIds(pos)
    // assert(vmap.containsKey(e.src.id))
-    et.srcAttr = vmap.get(et.srcId)
+    et.srcAttr = vertexArray(vidToIndex(et.srcId))
    et.dstId = edgePartition.dstIds(pos)
    // assert(vmap.containsKey(e.dst.id))
-    et.dstAttr = vmap.get(et.dstId)
+    et.dstAttr = vertexArray(vidToIndex(et.dstId))
    //println("Iter called: " + pos)
    et.attr = edgePartition.data(pos)
    pos += 1
@ -50,10 +51,10 @@ class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
    for (i <- (0 until edgePartition.size)) {
      currentEdge.srcId = edgePartition.srcIds(i)
      // assert(vmap.containsKey(e.src.id))
-      currentEdge.srcAttr = vmap.get(currentEdge.srcId)
+      currentEdge.srcAttr = vertexArray(vidToIndex(currentEdge.srcId))
      currentEdge.dstId = edgePartition.dstIds(i)
      // assert(vmap.containsKey(e.dst.id))
-      currentEdge.dstAttr = vmap.get(currentEdge.dstId)
+      currentEdge.dstAttr = vertexArray(vidToIndex(currentEdge.dstId))
      currentEdge.attr = edgePartition.data(i)
      lb += currentEdge
    }
@ -63,17 +64,18 @@ class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](

 object EdgeTripletBuilder {
  def makeTriplets[VD: ClassManifest, ED: ClassManifest]( 
-    vTableReplicated: IndexedRDD[Pid, VertexHashMap[VD]],
+    vTableReplicationMap: IndexedRDD[Pid, VertexIdToIndexMap],
+    vTableReplicatedValues: IndexedRDD[Pid, Array[VD]],
    eTable: IndexedRDD[Pid, EdgePartition[ED]]): RDD[EdgeTriplet[VD, ED]] = {
-    val iterFun = (iter: Iterator[(Pid, (VertexHashMap[VD], EdgePartition[ED]))]) => {
-      val (pid, (vmap, edgePartition)) = iter.next()
+    val iterFun = (iter: Iterator[(Pid, ((VertexIdToIndexMap, Array[VD]), EdgePartition[ED]))]) => {
+      val (pid, ((vidToIndex, vertexArray), edgePartition)) = iter.next()
      //assert(iter.hasNext == false)
      // Return an iterator that looks up the hash map to find matching 
      // vertices for each edge.
-      new EdgeTripletIterator(vmap, edgePartition)
+      new EdgeTripletIterator(vidToIndex, vertexArray, edgePartition)
    }
    ClosureCleaner.clean(iterFun) 
-    vTableReplicated.zipJoinRDD(eTable)
+    vTableReplicationMap.zipJoin(vTableReplicatedValues).zipJoinRDD(eTable)
      .mapPartitions( iterFun ) // end of map partition
  }

@ -93,13 +95,16 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (


  /**
-   * The vTableReplicated is a version of the vertex data after it is 
-   * replicated.
+   * (vTableReplicationMap: IndexedRDD[Pid, VertexIdToIndexMap]) is a version of the
+   * vertex data after it is replicated. Within each partition, it holds a map
+   * from vertex ID to the index where that vertex's attribute is stored. This
+   * index refers to an array in the same partition in vTableReplicatedValues.
+   *
+   * (vTableReplicatedValues: IndexedRDD[Pid, Array[VD]]) holds the vertex data
+   * and is arranged as described above.
   */
-  @transient val vTableReplicated: IndexedRDD[Pid, VertexHashMap[VD]] = 
-    createVTableReplicated(vTable, vid2pid, eTable) 
-
-
+  @transient val (vTableReplicationMap, vTableReplicatedValues) =
+    createVTableReplicated(vTable, vid2pid, eTable)


  /** Return a RDD of vertices. */
@ -114,7 +119,7 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (

  /** Return a RDD that brings edges with its source and destination vertices together. */
  @transient override val triplets: RDD[EdgeTriplet[VD, ED]] = 
-    EdgeTripletBuilder.makeTriplets(vTableReplicated, eTable)
+    EdgeTripletBuilder.makeTriplets(vTableReplicationMap, vTableReplicatedValues, eTable)


  // {
@ -136,8 +141,6 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
    eTable.cache()
    vid2pid.cache()
    vTable.cache()
-    /** @todo should we cache the replicated data? */
-    vTableReplicated.cache()
    this
  }

@ -179,15 +182,15 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (

  override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2):
    Graph[VD, ED2] = {
-    val newETable = eTable.join(vTableReplicated).mapValues{ 
-      case (edgePartition, vmap) =>
-      val et = new EdgeTriplet[VD, ED]    
-      edgePartition.map{e => 
-        et.set(e)
-        et.srcAttr = vmap(e.srcId)
-        et.dstAttr = vmap(e.dstId)
-        f(et)
-      }
+    val newETable = eTable.zipJoin(vTableReplicationMap).zipJoin(vTableReplicatedValues).mapValues{ 
+      case ((edgePartition, vidToIndex), vertexArray) =>
+        val et = new EdgeTriplet[VD, ED]
+        edgePartition.map{e =>
+          et.set(e)
+          et.srcAttr = vertexArray(vidToIndex(e.srcId))
+          et.dstAttr = vertexArray(vidToIndex(e.dstId))
+          f(et)
+        }
    }.asInstanceOf[IndexedRDD[Pid, EdgePartition[ED2]]]
    new GraphImpl(vTable, vid2pid, newETable)
  }
@ -344,20 +347,20 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
    ClosureCleaner.clean(reduceFunc)

    // Map and preaggregate 
-    val preAgg = vTableReplicated.zipJoinRDD(eTable).flatMap{
-      case (pid, (vmap, edgePartition)) => 
+    val preAgg = vTableReplicationMap.zipJoin(vTableReplicatedValues).zipJoinRDD(eTable).flatMap{
+      case (pid, ((vidToIndex, vertexArray), edgePartition)) => 
        val aggMap = new VertexHashMap[A]
        val et = new EdgeTriplet[VD, ED]
        edgePartition.foreach{e => 
          et.set(e)
-          et.srcAttr = vmap(e.srcId)
-          et.dstAttr = vmap(e.dstId)
+          et.srcAttr = vertexArray(vidToIndex(e.srcId))
+          et.dstAttr = vertexArray(vidToIndex(e.dstId))
          mapFunc(et).foreach{case (vid, a) => 
            if(aggMap.containsKey(vid)) {
-              aggMap.put(vid, reduceFunc(aggMap.get(vid), a))             
-              } else { aggMap.put(vid, a) }
-            }
+              aggMap.put(vid, reduceFunc(aggMap.get(vid), a))
+            } else { aggMap.put(vid, a) }
          }
+        }
        // Return the aggregate map
        aggMap.long2ObjectEntrySet().fastIterator().map{ 
          entry => (entry.getLongKey(), entry.getValue())
@ -475,21 +478,36 @@ object GraphImpl {
  protected def createVTableReplicated[VD: ClassManifest, ED: ClassManifest](
    vTable: IndexedRDD[Vid, VD], vid2pid: IndexedRDD[Vid, Array[Pid]],
    eTable: IndexedRDD[Pid, EdgePartition[ED]]): 
-    IndexedRDD[Pid, VertexHashMap[VD]] = {
+    (IndexedRDD[Pid, VertexIdToIndexMap], IndexedRDD[Pid, Array[VD]]) = {
    // Join vid2pid and vTable, generate a shuffle dependency on the joined 
    // result, and get the shuffle id so we can use it on the slave.
-    vTable.zipJoinRDD(vid2pid)
-      .flatMap { case (vid, (vdata, pids)) => 
-        pids.iterator.map { pid => MessageToPartition(pid, (vid, vdata)) }
-      }
-      .partitionBy(eTable.partitioner.get) //@todo assert edge table has partitioner
-      .mapPartitionsWithIndex( (pid, iter) => {
-        // Build the hashmap for each partition
-        val vmap = new VertexHashMap[VD]
-        for( msg <- iter ) { vmap.put(msg.data._1, msg.data._2) }
-        Array((pid, vmap)).iterator
-      }, preservesPartitioning = true)
-      .indexed(eTable.index)  
+    val msgsByPartition =
+      vTable.zipJoinRDD(vid2pid)
+        .flatMap { case (vid, (vdata, pids)) =>
+          pids.iterator.map { pid => MessageToPartition(pid, (vid, vdata)) }
+        }
+        .partitionBy(eTable.partitioner.get) //@todo assert edge table has partitioner
+
+    val vTableReplicationMap: IndexedRDD[Pid, VertexIdToIndexMap] =
+      msgsByPartition.mapPartitionsWithIndex( (pid, iter) => {
+        val vidToIndex = new VertexIdToIndexMap
+        var i = 0
+        for (msg <- iter) {
+          vidToIndex.put(msg.data._1, i)
+        }
+        Array((pid, vidToIndex)).iterator
+      }, preservesPartitioning = true).indexed(eTable.index)
+
+    val vTableReplicatedValues: IndexedRDD[Pid, Array[VD]] =
+      msgsByPartition.mapPartitionsWithIndex( (pid, iter) => {
+        val vertexArray = new ArrayBuffer[VD]
+        for (msg <- iter) {
+          vertexArray += msg.data._2
+        }
+        Array((pid, vertexArray.toArray)).iterator
+      }, preservesPartitioning = true).indexed(eTable.index)
+
+    (vTableReplicationMap, vTableReplicatedValues)
  }


--- a/graph/src/main/scala/org/apache/spark/graph/package.scala
+++ b/graph/src/main/scala/org/apache/spark/graph/package.scala
@ -8,6 +8,8 @@ package object graph {
  type VertexHashMap[T] = it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap[T]
  type VertexSet = it.unimi.dsi.fastutil.longs.LongOpenHashSet
  type VertexArrayList = it.unimi.dsi.fastutil.longs.LongArrayList
+  // @todo replace with rxin's fast hashmap
+  type VertexIdToIndexMap = scala.collection.mutable.HashMap[Vid, Int]

  /**
   * Return the default null-like value for a data type T.