Merge pull request #61 from ankurdave/pid2vid
Shuffle replicated vertex attributes efficiently in columnar format
This commit is contained in:
commit
b8e294a21b
|
@ -18,6 +18,7 @@ class GraphKryoRegistrator extends KryoRegistrator {
|
||||||
kryo.register(classOf[EdgePartition[Object]])
|
kryo.register(classOf[EdgePartition[Object]])
|
||||||
kryo.register(classOf[BitSet])
|
kryo.register(classOf[BitSet])
|
||||||
kryo.register(classOf[VertexIdToIndexMap])
|
kryo.register(classOf[VertexIdToIndexMap])
|
||||||
|
kryo.register(classOf[VertexAttributeBlock[Object]])
|
||||||
// This avoids a large number of hash table lookups.
|
// This avoids a large number of hash table lookups.
|
||||||
kryo.setReferences(false)
|
kryo.setReferences(false)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
package org.apache.spark.graph.impl
|
package org.apache.spark.graph.impl
|
||||||
|
|
||||||
|
import org.apache.spark.SparkContext._
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.util.collection.OpenHashSet
|
import org.apache.spark.util.collection.{OpenHashSet, PrimitiveKeyOpenHashMap}
|
||||||
|
|
||||||
import org.apache.spark.graph._
|
import org.apache.spark.graph._
|
||||||
import org.apache.spark.graph.impl.MsgRDDFunctions._
|
import org.apache.spark.graph.impl.MsgRDDFunctions._
|
||||||
|
@ -34,7 +35,7 @@ class VTableReplicatedValues[VD: ClassManifest](
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class VertexAttributeBlock[VD: ClassManifest](val vids: Array[Vid], val attrs: Array[VD])
|
||||||
|
|
||||||
object VTableReplicatedValues {
|
object VTableReplicatedValues {
|
||||||
protected def createVTableReplicated[VD: ClassManifest](
|
protected def createVTableReplicated[VD: ClassManifest](
|
||||||
|
@ -44,13 +45,22 @@ object VTableReplicatedValues {
|
||||||
includeSrcAttr: Boolean,
|
includeSrcAttr: Boolean,
|
||||||
includeDstAttr: Boolean): RDD[(Pid, Array[VD])] = {
|
includeDstAttr: Boolean): RDD[(Pid, Array[VD])] = {
|
||||||
|
|
||||||
// Join vid2pid and vTable, generate a shuffle dependency on the joined
|
val pid2vid = vid2pid.getPid2Vid(includeSrcAttr, includeDstAttr)
|
||||||
// result, and get the shuffle id so we can use it on the slave.
|
|
||||||
val msgsByPartition = vTable.zipJoinFlatMap(vid2pid.get(includeSrcAttr, includeDstAttr)) {
|
val msgsByPartition = pid2vid.zipPartitions(vTable.index.rdd, vTable.valuesRDD) {
|
||||||
// TODO(rxin): reuse VertexBroadcastMessage
|
(pid2vidIter, indexIter, valuesIter) =>
|
||||||
(vid, vdata, pids) => pids.iterator.map { pid =>
|
val pid2vid = pid2vidIter.next()
|
||||||
new VertexBroadcastMsg[VD](pid, vid, vdata)
|
val index = indexIter.next()
|
||||||
|
val values = valuesIter.next()
|
||||||
|
val vmap = new PrimitiveKeyOpenHashMap(index, values._1)
|
||||||
|
|
||||||
|
// Send each partition the vertex attributes it wants
|
||||||
|
val output = new Array[(Pid, VertexAttributeBlock[VD])](pid2vid.size)
|
||||||
|
for (pid <- 0 until pid2vid.size) {
|
||||||
|
val block = new VertexAttributeBlock(pid2vid(pid), pid2vid(pid).map(vid => vmap(vid)))
|
||||||
|
output(pid) = (pid, block)
|
||||||
}
|
}
|
||||||
|
output.iterator
|
||||||
}.partitionBy(localVidMap.partitioner.get).cache()
|
}.partitionBy(localVidMap.partitioner.get).cache()
|
||||||
|
|
||||||
localVidMap.zipPartitions(msgsByPartition){
|
localVidMap.zipPartitions(msgsByPartition){
|
||||||
|
@ -59,14 +69,16 @@ object VTableReplicatedValues {
|
||||||
assert(!mapIter.hasNext)
|
assert(!mapIter.hasNext)
|
||||||
// Populate the vertex array using the vidToIndex map
|
// Populate the vertex array using the vidToIndex map
|
||||||
val vertexArray = new Array[VD](vidToIndex.capacity)
|
val vertexArray = new Array[VD](vidToIndex.capacity)
|
||||||
for (msg <- msgsIter) {
|
for ((_, block) <- msgsIter) {
|
||||||
val ind = vidToIndex.getPos(msg.vid) & OpenHashSet.POSITION_MASK
|
for (i <- 0 until block.vids.size) {
|
||||||
vertexArray(ind) = msg.data
|
val vid = block.vids(i)
|
||||||
|
val attr = block.attrs(i)
|
||||||
|
val ind = vidToIndex.getPos(vid) & OpenHashSet.POSITION_MASK
|
||||||
|
vertexArray(ind) = attr
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Iterator((pid, vertexArray))
|
Iterator((pid, vertexArray))
|
||||||
}.cache()
|
}.cache()
|
||||||
|
|
||||||
// @todo assert edge table has partitioner
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package org.apache.spark.graph.impl
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
import scala.collection.mutable.ArrayBuffer
|
import scala.collection.mutable.ArrayBuffer
|
||||||
|
import scala.collection.mutable.ArrayBuilder
|
||||||
|
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.storage.StorageLevel
|
import org.apache.spark.storage.StorageLevel
|
||||||
|
@ -20,6 +21,11 @@ class Vid2Pid(
|
||||||
val dstAttrOnly: VertexSetRDD[Array[Pid]] = createVid2Pid(false, true)
|
val dstAttrOnly: VertexSetRDD[Array[Pid]] = createVid2Pid(false, true)
|
||||||
val noAttrs: VertexSetRDD[Array[Pid]] = createVid2Pid(false, false)
|
val noAttrs: VertexSetRDD[Array[Pid]] = createVid2Pid(false, false)
|
||||||
|
|
||||||
|
val pid2VidBothAttrs: RDD[Array[Array[Vid]]] = createPid2Vid(bothAttrs)
|
||||||
|
val pid2VidSrcAttrOnly: RDD[Array[Array[Vid]]] = createPid2Vid(srcAttrOnly)
|
||||||
|
val pid2VidDstAttrOnly: RDD[Array[Array[Vid]]] = createPid2Vid(dstAttrOnly)
|
||||||
|
val pid2VidNoAttrs: RDD[Array[Array[Vid]]] = createPid2Vid(noAttrs)
|
||||||
|
|
||||||
def get(includeSrcAttr: Boolean, includeDstAttr: Boolean): VertexSetRDD[Array[Pid]] =
|
def get(includeSrcAttr: Boolean, includeDstAttr: Boolean): VertexSetRDD[Array[Pid]] =
|
||||||
(includeSrcAttr, includeDstAttr) match {
|
(includeSrcAttr, includeDstAttr) match {
|
||||||
case (true, true) => bothAttrs
|
case (true, true) => bothAttrs
|
||||||
|
@ -28,6 +34,14 @@ class Vid2Pid(
|
||||||
case (false, false) => noAttrs
|
case (false, false) => noAttrs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def getPid2Vid(includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[Vid]]] =
|
||||||
|
(includeSrcAttr, includeDstAttr) match {
|
||||||
|
case (true, true) => pid2VidBothAttrs
|
||||||
|
case (true, false) => pid2VidSrcAttrOnly
|
||||||
|
case (false, true) => pid2VidDstAttrOnly
|
||||||
|
case (false, false) => pid2VidNoAttrs
|
||||||
|
}
|
||||||
|
|
||||||
def persist(newLevel: StorageLevel) {
|
def persist(newLevel: StorageLevel) {
|
||||||
bothAttrs.persist(newLevel)
|
bothAttrs.persist(newLevel)
|
||||||
srcAttrOnly.persist(newLevel)
|
srcAttrOnly.persist(newLevel)
|
||||||
|
@ -55,4 +69,19 @@ class Vid2Pid(
|
||||||
(a: ArrayBuffer[Pid], b: ArrayBuffer[Pid]) => a ++ b)
|
(a: ArrayBuffer[Pid], b: ArrayBuffer[Pid]) => a ++ b)
|
||||||
.mapValues(a => a.toArray).cache()
|
.mapValues(a => a.toArray).cache()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an intermediate pid2vid structure that tells each partition of the
|
||||||
|
* vertex data where it should go.
|
||||||
|
*/
|
||||||
|
private def createPid2Vid(vid2pid: VertexSetRDD[Array[Pid]]): RDD[Array[Array[Vid]]] = {
|
||||||
|
val numPartitions = vid2pid.partitions.size
|
||||||
|
vid2pid.mapPartitions { iter =>
|
||||||
|
val pid2vidLocal = Array.fill[ArrayBuilder[Vid]](numPartitions)(ArrayBuilder.make[Vid])
|
||||||
|
for ((vid, pids) <- iter) {
|
||||||
|
pids.foreach { pid => pid2vidLocal(pid) += vid }
|
||||||
|
}
|
||||||
|
Iterator(pid2vidLocal.map(_.result))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue