Merge remote-tracking branch 'upstream/master' into add_project_to_graph

Conflicts:
	graph/src/main/scala/org/apache/spark/graph/Graph.scala
	graph/src/main/scala/org/apache/spark/graph/impl/GraphImpl.scala
Ankur Dave 2013-12-18 13:00:58 -08:00
commit 9193a8f788
38 changed files with 2370 additions and 1952 deletions


@@ -245,7 +245,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](self: RDD[(K, V)])
    if (getKeyClass().isArray && partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("Default partitioner cannot partition array keys.")
    }
-   new ShuffledRDD[K, V, (K, V)](self, partitioner)
+   if (self.partitioner == partitioner) self else new ShuffledRDD[K, V, (K, V)](self, partitioner)
  }
  /**
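
A hedged sketch of what the new short-circuit in partitionBy is aimed at: re-partitioning an RDD that already uses the requested partitioner should no longer force an extra shuffle. The driver setup and sample data below are illustrative only, and whether the short-circuit fires depends on how self.partitioner compares to the argument.

    import org.apache.spark.{HashPartitioner, SparkContext}
    import org.apache.spark.SparkContext._

    object PartitionBySketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "PartitionBySketch")
        val part = new HashPartitioner(4)
        val pairs = sc.parallelize(Seq((1, "a"), (2, "b"))).partitionBy(part).cache()
        // With the change above, asking for the same partitioner again may simply
        // return `pairs` itself instead of building a new ShuffledRDD.
        val again = pairs.partitionBy(part)
        println(again eq pairs)
        sc.stop()
      }
    }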


@@ -22,7 +22,8 @@ import java.io.{ObjectOutputStream, IOException}
 private[spark] class ZippedPartitionsPartition(
     idx: Int,
-    @transient rdds: Seq[RDD[_]])
+    @transient rdds: Seq[RDD[_]],
+    @transient val preferredLocations: Seq[String])
   extends Partition {
   override val index: Int = idx
@@ -47,27 +48,21 @@ abstract class ZippedPartitionsBaseRDD[V: ClassManifest](
     if (preservesPartitioning) firstParent[Any].partitioner else None
   override def getPartitions: Array[Partition] = {
-    val sizes = rdds.map(x => x.partitions.size)
-    if (!sizes.forall(x => x == sizes(0))) {
+    val numParts = rdds.head.partitions.size
+    if (!rdds.forall(rdd => rdd.partitions.size == numParts)) {
       throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions")
     }
-    val array = new Array[Partition](sizes(0))
-    for (i <- 0 until sizes(0)) {
-      array(i) = new ZippedPartitionsPartition(i, rdds)
+    Array.tabulate[Partition](numParts) { i =>
+      val prefs = rdds.map(rdd => rdd.preferredLocations(rdd.partitions(i)))
+      // Check whether there are any hosts that match all RDDs; otherwise return the union
+      val exactMatchLocations = prefs.reduce((x, y) => x.intersect(y))
+      val locs = if (!exactMatchLocations.isEmpty) exactMatchLocations else prefs.flatten.distinct
+      new ZippedPartitionsPartition(i, rdds, locs)
     }
-    array
   }
   override def getPreferredLocations(s: Partition): Seq[String] = {
-    val parts = s.asInstanceOf[ZippedPartitionsPartition].partitions
-    val prefs = rdds.zip(parts).map { case (rdd, p) => rdd.preferredLocations(p) }
-    // Check whether there are any hosts that match all RDDs; otherwise return the union
-    val exactMatchLocations = prefs.reduce((x, y) => x.intersect(y))
-    if (!exactMatchLocations.isEmpty) {
-      exactMatchLocations
-    } else {
-      prefs.flatten.distinct
-    }
+    s.asInstanceOf[ZippedPartitionsPartition].preferredLocations
   }
   override def clearDependencies() {
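
The preferred-location rule is now evaluated once per partition when the ZippedPartitionsPartition is built: hosts shared by every zipped RDD win, otherwise the distinct union is used. A standalone sketch of just that rule; the host names are made up.

    object PreferredLocationRule {
      def choose(prefs: Seq[Seq[String]]): Seq[String] = {
        // Check whether there are any hosts that match all RDDs; otherwise return the union
        val exactMatchLocations = prefs.reduce((x, y) => x.intersect(y))
        if (!exactMatchLocations.isEmpty) exactMatchLocations else prefs.flatten.distinct
      }

      def main(args: Array[String]) {
        println(choose(Seq(Seq("hostA", "hostB"), Seq("hostB", "hostC"))))  // List(hostB)
        println(choose(Seq(Seq("hostA"), Seq("hostC"))))                    // List(hostA, hostC)
      }
    }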


@@ -102,6 +102,10 @@ class BitSet(numBits: Int) {
    words(index >> 6) |= bitmask   // div by 64 and mask
  }
+ def unset(index: Int) {
+   val bitmask = 1L << (index & 0x3f)  // mod 64 and shift
+   words(index >> 6) &= ~bitmask       // div by 64 and mask
+ }
  /**
   * Return the value of the bit with the specified index. The value is true if the bit with
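
The new unset mirrors set: index >> 6 selects the 64-bit word and index & 0x3f the bit inside it, which the mask then clears. A minimal sketch of the same word/bit arithmetic on a plain Array[Long], not the BitSet class itself.

    object BitMaskSketch {
      def main(args: Array[String]) {
        val words = new Array[Long](2)               // room for 128 bits
        val index = 70                               // word 1, bit 6
        val bitmask = 1L << (index & 0x3f)
        words(index >> 6) |= bitmask                 // set
        println((words(index >> 6) & bitmask) != 0)  // true
        words(index >> 6) &= ~bitmask                // unset, as in the new method
        println((words(index >> 6) & bitmask) != 0)  // false
      }
    }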


@@ -158,7 +158,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
  /** Return the value at the specified position. */
  def getValue(pos: Int): T = _data(pos)
- def iterator() = new Iterator[T] {
+ def iterator = new Iterator[T] {
    var pos = nextPos(0)
    override def hasNext: Boolean = pos != INVALID_POS
    override def next(): T = {
@@ -249,8 +249,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassManifest](
   * in the lower bits, similar to java.util.HashMap
   */
  private def hashcode(h: Int): Int = {
-   val r = h ^ (h >>> 20) ^ (h >>> 12)
-   r ^ (r >>> 7) ^ (r >>> 4)
+   it.unimi.dsi.fastutil.HashCommon.murmurHash3(h)
  }
  private def nextPowerOf2(n: Int): Int = {


@@ -71,7 +71,7 @@ class PrimitiveKeyOpenHashMap[@specialized(Long, Int) K: ClassManifest,
  /** Set the value for a key */
- def setMerge(k: K, v: V, mergeF: (V,V) => V) {
+ def setMerge(k: K, v: V, mergeF: (V, V) => V) {
    val pos = keySet.addWithoutResize(k)
    val ind = pos & OpenHashSet.POSITION_MASK
    if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) { // if first add


@@ -1,6 +1,8 @@
 package org.apache.spark.graph
 import org.apache.spark._
+import org.apache.spark.graph.algorithms._
 /**
  * The Analytics object contains a collection of basic graph analytics
@@ -12,272 +14,6 @@ import org.apache.spark._
  */
 object Analytics extends Logging {
/**
* Run PageRank for a fixed number of iterations returning a graph
* with vertex attributes containing the PageRank and edge
* attributes the normalized edge weight.
*
* The following PageRank fixed point is computed for each vertex.
*
* {{{
* var PR = Array.fill(n)( 1.0 )
* val oldPR = Array.fill(n)( 1.0 )
* for( iter <- 0 until numIter ) {
* swap(oldPR, PR)
* for( i <- 0 until n ) {
* PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
* }
* }
* }}}
*
* where `alpha` is the random reset probability (typically 0.15),
* `inNbrs[i]` is the set of neighbors whick link to `i` and
* `outDeg[j]` is the out degree of vertex `j`.
*
* Note that this is not the "normalized" PageRank and as a
* consequence pages that have no inlinks will have a PageRank of
* alpha.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param numIter the number of iterations of PageRank to run
* @param resetProb the random reset probability (alpha)
*
* @return the graph containing with each vertex containing the
* PageRank and each edge containing the normalized weight.
*
*/
def pagerank[VD: Manifest, ED: Manifest](
graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15):
Graph[Double, Double] = {
/**
* Initialize the pagerankGraph with each edge attribute having
* weight 1/outDegree and each vertex with attribute 1.0.
*/
val pagerankGraph: Graph[Double, Double] = graph
// Associate the degree with each vertex
.outerJoinVertices(graph.outDegrees){
(vid, vdata, deg) => deg.getOrElse(0)
}
// Set the weight on the edges based on the degree
.mapTriplets( e => 1.0 / e.srcAttr )
// Set the vertex attributes to the initial pagerank values
.mapVertices( (id, attr) => 1.0 )
// Display statistics about pagerank
println(pagerankGraph.statistics)
// Define the three functions needed to implement PageRank in the GraphX
// version of Pregel
def vertexProgram(id: Vid, attr: Double, msgSum: Double): Double =
resetProb + (1.0 - resetProb) * msgSum
def sendMessage(edge: EdgeTriplet[Double, Double]) =
Iterator((edge.dstId, edge.srcAttr * edge.attr))
def messageCombiner(a: Double, b: Double): Double = a + b
// The initial message received by all vertices in PageRank
val initialMessage = 0.0
// Execute pregel for a fixed number of iterations.
Pregel(pagerankGraph, initialMessage, numIter)(
vertexProgram, sendMessage, messageCombiner)
}
/**
* Run a dynamic version of PageRank returning a graph with vertex
* attributes containing the PageRank and edge attributes containing
* the normalized edge weight.
*
* {{{
* var PR = Array.fill(n)( 1.0 )
* val oldPR = Array.fill(n)( 0.0 )
* while( max(abs(PR - oldPr)) > tol ) {
* swap(oldPR, PR)
* for( i <- 0 until n if abs(PR[i] - oldPR[i]) > tol ) {
* PR[i] = alpha + (1 - \alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
* }
* }
* }}}
*
* where `alpha` is the random reset probability (typically 0.15),
* `inNbrs[i]` is the set of neighbors whick link to `i` and
* `outDeg[j]` is the out degree of vertex `j`.
*
* Note that this is not the "normalized" PageRank and as a
* consequence pages that have no inlinks will have a PageRank of
* alpha.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param tol the tolerance allowed at convergence (smaller => more
* accurate).
* @param resetProb the random reset probability (alpha)
*
* @return the graph containing with each vertex containing the
* PageRank and each edge containing the normalized weight.
*/
def deltaPagerank[VD: Manifest, ED: Manifest](
graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15):
Graph[Double, Double] = {
/**
* Initialize the pagerankGraph with each edge attribute
* having weight 1/outDegree and each vertex with attribute 1.0.
*/
val pagerankGraph: Graph[(Double, Double), Double] = graph
// Associate the degree with each vertex
.outerJoinVertices(graph.outDegrees){
(vid, vdata, deg) => deg.getOrElse(0)
}
// Set the weight on the edges based on the degree
.mapTriplets( e => 1.0 / e.srcAttr )
// Set the vertex attributes to (initalPR, delta = 0)
.mapVertices( (id, attr) => (0.0, 0.0) )
// Display statistics about pagerank
println(pagerankGraph.statistics)
// Define the three functions needed to implement PageRank in the GraphX
// version of Pregel
def vertexProgram(id: Vid, attr: (Double, Double), msgSum: Double): (Double, Double) = {
val (oldPR, lastDelta) = attr
val newPR = oldPR + (1.0 - resetProb) * msgSum
(newPR, newPR - oldPR)
}
def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = {
if (edge.srcAttr._2 > tol) {
Iterator((edge.dstId, edge.srcAttr._2 * edge.attr))
} else {
Iterator.empty
}
}
def messageCombiner(a: Double, b: Double): Double = a + b
// The initial message received by all vertices in PageRank
val initialMessage = resetProb / (1.0 - resetProb)
// Execute a dynamic version of Pregel.
Pregel(pagerankGraph, initialMessage)(
vertexProgram, sendMessage, messageCombiner)
.mapVertices( (vid, attr) => attr._1 )
} // end of deltaPageRank
/**
* Compute the connected component membership of each vertex and
* return an RDD with the vertex value containing the lowest vertex
* id in the connected component containing that vertex.
*
* @tparam VD the vertex attribute type (discarded in the
* computation)
* @tparam ED the edge attribute type (preserved in the computation)
*
* @param graph the graph for which to compute the connected
* components
*
* @return a graph with vertex attributes containing the smallest
* vertex in each connected component
*/
def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED]):
Graph[Vid, ED] = {
val ccGraph = graph.mapVertices { case (vid, _) => vid }
def sendMessage(edge: EdgeTriplet[Vid, ED]) = {
if (edge.srcAttr < edge.dstAttr) {
Iterator((edge.dstId, edge.srcAttr))
} else if (edge.srcAttr > edge.dstAttr) {
Iterator((edge.srcId, edge.dstAttr))
} else {
Iterator.empty
}
}
val initialMessage = Long.MaxValue
Pregel(ccGraph, initialMessage)(
(id, attr, msg) => math.min(attr, msg),
sendMessage,
(a,b) => math.min(a,b)
)
} // end of connectedComponents
/**
* Compute the number of triangles passing through each vertex.
*
* The algorithm is relatively straightforward and can be computed in
* three steps:
*
* 1) Compute the set of neighbors for each vertex
* 2) For each edge compute the intersection of the sets and send the
* count to both vertices.
* 3) Compute the sum at each vertex and divide by two since each
* triangle is counted twice.
*
*
* @param graph a graph with `sourceId` less than `destId`
* @tparam VD
* @tparam ED
* @return
*/
def triangleCount[VD: ClassManifest, ED: ClassManifest](rawGraph: Graph[VD,ED]):
Graph[Int, ED] = {
// Remove redundant edges
val graph = rawGraph.groupEdges( (a,b) => a ).cache
// Construct set representations of the neighborhoods
val nbrSets: VertexSetRDD[VertexSet] =
graph.collectNeighborIds(EdgeDirection.Both).mapValuesWithKeys { (vid, nbrs) =>
val set = new VertexSet(4)
var i = 0
while (i < nbrs.size) {
// prevent self cycle
if(nbrs(i) != vid) {
set.add(nbrs(i))
}
i += 1
}
set
}
// join the sets with the graph
val setGraph: Graph[VertexSet, ED] = graph.outerJoinVertices(nbrSets) {
(vid, _, optSet) => optSet.getOrElse(null)
}
// Edge function computes intersection of smaller vertex with larger vertex
def edgeFunc(et: EdgeTriplet[VertexSet, ED]): Iterator[(Vid, Int)] = {
assert(et.srcAttr != null)
assert(et.dstAttr != null)
val (smallSet, largeSet) = if (et.srcAttr.size < et.dstAttr.size) {
(et.srcAttr, et.dstAttr)
} else {
(et.dstAttr, et.srcAttr)
}
val iter = smallSet.iterator()
var counter: Int = 0
while (iter.hasNext) {
val vid = iter.next
if (vid != et.srcId && vid != et.dstId && largeSet.contains(vid)) { counter += 1 }
}
Iterator((et.srcId, counter), (et.dstId, counter))
}
// compute the intersection along edges
val counters: VertexSetRDD[Int] = setGraph.mapReduceTriplets(edgeFunc, _ + _)
// Merge counters with the graph and divide by two since each triangle is counted twice
graph.outerJoinVertices(counters) {
(vid, _, optCounter: Option[Int]) =>
val dblCount = optCounter.getOrElse(0)
// double count should be even (divisible by two)
assert((dblCount & 1) == 0)
dblCount / 2
}
} // end of TriangleCount
  def main(args: Array[String]) = {
    val host = args(0)
    val taskType = args(1)
@@ -301,10 +37,10 @@ object Analytics extends Logging {
    def pickPartitioner(v: String): PartitionStrategy = {
      v match {
-       case "RandomVertexCut" => RandomVertexCut()
-       case "EdgePartition1D" => EdgePartition1D()
-       case "EdgePartition2D" => EdgePartition2D()
-       case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut()
+       case "RandomVertexCut" => RandomVertexCut
+       case "EdgePartition1D" => EdgePartition1D
+       case "EdgePartition2D" => EdgePartition2D
+       case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut
        case _ => throw new IllegalArgumentException("Invalid Partition Strategy: " + v)
      }
    }
@@ -318,57 +54,43 @@ object Analytics extends Logging {
    taskType match {
      case "pagerank" => {
-       var numIter = Int.MaxValue
-       var isDynamic = false
        var tol:Float = 0.001F
        var outFname = ""
        var numVPart = 4
        var numEPart = 4
-       var partitionStrategy: PartitionStrategy = RandomVertexCut()
+       var partitionStrategy: Option[PartitionStrategy] = None
        options.foreach{
-         case ("numIter", v) => numIter = v.toInt
-         case ("dynamic", v) => isDynamic = v.toBoolean
          case ("tol", v) => tol = v.toFloat
          case ("output", v) => outFname = v
          case ("numVPart", v) => numVPart = v.toInt
          case ("numEPart", v) => numEPart = v.toInt
-         case ("partStrategy", v) => partitionStrategy = pickPartitioner(v)
+         case ("partStrategy", v) => partitionStrategy = Some(pickPartitioner(v))
          case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
        }
-       if(!isDynamic && numIter == Int.MaxValue) {
-         println("Set number of iterations!")
-         sys.exit(1)
-       }
        println("======================================")
        println("| PageRank |")
-       println("--------------------------------------")
-       println(" Using parameters:")
-       println(" \tDynamic: " + isDynamic)
-       if(isDynamic) println(" \t |-> Tolerance: " + tol)
-       println(" \tNumIter: " + numIter)
        println("======================================")
        val sc = new SparkContext(host, "PageRank(" + fname + ")")
-       val graph = GraphLoader.edgeListFile(sc, fname,
-         minEdgePartitions = numEPart, partitionStrategy=partitionStrategy).cache()
+       val unpartitionedGraph = GraphLoader.edgeListFile(sc, fname,
+         minEdgePartitions = numEPart).cache()
+       val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_))
-       val startTime = System.currentTimeMillis
-       logInfo("GRAPHX: starting tasks")
-       logInfo("GRAPHX: Number of vertices " + graph.vertices.count)
-       logInfo("GRAPHX: Number of edges " + graph.edges.count)
+       println("GRAPHX: Number of vertices " + graph.vertices.count)
+       println("GRAPHX: Number of edges " + graph.edges.count)
        //val pr = Analytics.pagerank(graph, numIter)
-       val pr = if(isDynamic) Analytics.deltaPagerank(graph, tol, numIter)
-         else Analytics.pagerank(graph, numIter)
+       val pr = PageRank.runStandalone(graph, tol)
-       logInfo("GRAPHX: Total rank: " + pr.vertices.map{ case (id,r) => r }.reduce(_+_) )
+       println("GRAPHX: Total rank: " + pr.map(_._2).reduce(_+_))
        if (!outFname.isEmpty) {
-         println("Saving pageranks of pages to " + outFname)
-         pr.vertices.map{case (id, r) => id + "\t" + r}.saveAsTextFile(outFname)
+         logWarning("Saving pageranks of pages to " + outFname)
+         pr.map{case (id, r) => id + "\t" + r}.saveAsTextFile(outFname)
        }
-       logInfo("GRAPHX: Runtime: " + ((System.currentTimeMillis - startTime)/1000.0) + " seconds")
        sc.stop()
      }
@@ -379,14 +101,14 @@ object Analytics extends Logging {
        var numVPart = 4
        var numEPart = 4
        var isDynamic = false
-       var partitionStrategy: PartitionStrategy = RandomVertexCut()
+       var partitionStrategy: Option[PartitionStrategy] = None
        options.foreach{
          case ("numIter", v) => numIter = v.toInt
          case ("dynamic", v) => isDynamic = v.toBoolean
          case ("numEPart", v) => numEPart = v.toInt
          case ("numVPart", v) => numVPart = v.toInt
-         case ("partStrategy", v) => partitionStrategy = pickPartitioner(v)
+         case ("partStrategy", v) => partitionStrategy = Some(pickPartitioner(v))
          case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt)
        }
@@ -403,9 +125,11 @@ object Analytics extends Logging {
        println("======================================")
        val sc = new SparkContext(host, "ConnectedComponents(" + fname + ")")
-       val graph = GraphLoader.edgeListFile(sc, fname,
-         minEdgePartitions = numEPart, partitionStrategy=partitionStrategy).cache()
-       val cc = Analytics.connectedComponents(graph)
+       val unpartitionedGraph = GraphLoader.edgeListFile(sc, fname,
+         minEdgePartitions = numEPart).cache()
+       val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_))
+       val cc = ConnectedComponents.run(graph)
        println("Components: " + cc.vertices.map{ case (vid,data) => data}.distinct())
        sc.stop()
      }
@@ -413,7 +137,8 @@ object Analytics extends Logging {
      case "triangles" => {
        var numVPart = 4
        var numEPart = 4
-       var partitionStrategy: PartitionStrategy = RandomVertexCut()
+       // TriangleCount requires the graph to be partitioned
+       var partitionStrategy: PartitionStrategy = RandomVertexCut
        options.foreach{
          case ("numEPart", v) => numEPart = v.toInt
@@ -426,8 +151,8 @@ object Analytics extends Logging {
        println("--------------------------------------")
        val sc = new SparkContext(host, "TriangleCount(" + fname + ")")
        val graph = GraphLoader.edgeListFile(sc, fname, canonicalOrientation = true,
-         minEdgePartitions = numEPart, partitionStrategy=partitionStrategy).cache()
-       val triangles = Analytics.triangleCount(graph)
+         minEdgePartitions = numEPart).partitionBy(partitionStrategy).cache()
+       val triangles = TriangleCount.run(graph)
        println("Triangles: " + triangles.vertices.map {
          case (vid,data) => data.toLong
        }.reduce(_+_) / 3)
@@ -536,42 +261,6 @@ object Analytics extends Logging {
    }
  }
// /**
// * Compute the PageRank of a graph returning the pagerank of each vertex as an RDD
// */
// def dynamicPagerank[VD: Manifest, ED: Manifest](graph: Graph[VD, ED],
// tol: Double, maxIter: Int = 10) = {
// // Compute the out degree of each vertex
// val pagerankGraph = graph.updateVertices[Int, (Int, Double, Double)](graph.outDegrees,
// (vertex, degIter) => (degIter.sum, 1.0, 1.0)
// )
// // Run PageRank
// GraphLab.iterateGAS(pagerankGraph)(
// (me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
// (a: Double, b: Double) => a + b,
// (vertex, a: Option[Double]) =>
// (vertex.data._1, (0.15 + 0.85 * a.getOrElse(0.0)), vertex.data._2), // apply
// (me_id, edge) => math.abs(edge.src.data._2 - edge.dst.data._1) > tol, // scatter
// maxIter).mapVertices { case Vertex(vid, data) => Vertex(vid, data._2) }
// }
// /**
// * Compute the connected component membership of each vertex
// * and return an RDD with the vertex value containing the
// * lowest vertex id in the connected component containing
// * that vertex.
// */
// def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED], numIter: Int) = {
// val ccGraph = graph.mapVertices { case Vertex(vid, _) => Vertex(vid, vid) }
// GraphLab.iterateGA[Int, ED, Int](ccGraph)(
// (me_id, edge) => edge.otherVertex(me_id).data, // gather
// (a: Int, b: Int) => math.min(a, b), // merge
// (v, a: Option[Int]) => math.min(v.data, a.getOrElse(Integer.MAX_VALUE)), // apply
// numIter,
// gatherDirection = EdgeDirection.Both)
// }
  // /**
  //  * Compute the shortest path to a set of markers
  //  */
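
With the algorithms moved into org.apache.spark.graph.algorithms, the driver above reduces to "load, optionally repartition, run". A condensed, hedged sketch of the same flow; the file path, partition count and tolerance are placeholders, and PageRank.runStandalone is assumed to return (vertex id, rank) pairs the way the driver code uses it.

    import org.apache.spark.SparkContext
    import org.apache.spark.graph._
    import org.apache.spark.graph.algorithms._

    object PageRankDriverSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "PageRankDriverSketch")
        // Load the edge list without a partition strategy, then repartition explicitly.
        val graph = GraphLoader.edgeListFile(sc, "edges.txt", minEdgePartitions = 4)
          .partitionBy(RandomVertexCut).cache()
        println("GRAPHX: Number of vertices " + graph.vertices.count)
        println("GRAPHX: Number of edges " + graph.edges.count)
        val pr = PageRank.runStandalone(graph, 0.001)
        println("GRAPHX: Total rank: " + pr.map(_._2).reduce(_ + _))
        sc.stop()
      }
    }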


@@ -30,7 +30,6 @@ case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED]
  def otherVertexId(vid: Vid): Vid =
    if (srcId == vid) dstId else { assert(dstId == vid); srcId }
  /**
   * Return the relative direction of the edge to the corresponding
   * vertex.
@@ -41,5 +40,11 @@
   */
  def relativeDirection(vid: Vid): EdgeDirection =
    if (vid == srcId) EdgeDirection.Out else { assert(vid == dstId); EdgeDirection.In }
+}
+object Edge {
+ def lexicographicOrdering[ED] = new Ordering[Edge[ED]] {
+   override def compare(a: Edge[ED], b: Edge[ED]): Int =
+     Ordering[(Vid, Vid)].compare((a.srcId, a.dstId), (b.srcId, b.dstId))
+ }
 }
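
Edge.lexicographicOrdering orders edges by (srcId, dstId). A small sketch of sorting a local edge array with it; the sample edges are made up.

    import org.apache.spark.graph.Edge

    object EdgeSortSketch {
      def main(args: Array[String]) {
        val edges = Array(Edge(3L, 1L, "c"), Edge(1L, 2L, "b"), Edge(1L, 1L, "a"))
        // Sorts by source id first, then destination id.
        val sorted = edges.sorted(Edge.lexicographicOrdering[String])
        sorted.foreach(e => println(e.srcId + " -> " + e.dstId))  // 1 -> 1, 1 -> 2, 3 -> 1
      }
    }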


@@ -0,0 +1,67 @@
package org.apache.spark.graph
import org.apache.spark.{OneToOneDependency, Partition, Partitioner, TaskContext}
import org.apache.spark.graph.impl.EdgePartition
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
class EdgeRDD[@specialized ED: ClassManifest](
val partitionsRDD: RDD[(Pid, EdgePartition[ED])])
extends RDD[Edge[ED]](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {
partitionsRDD.setName("EdgeRDD")
override protected def getPartitions: Array[Partition] = partitionsRDD.partitions
/**
* If partitionsRDD already has a partitioner, use it. Otherwise assume that the Pids in
* partitionsRDD correspond to the actual partitions and create a new partitioner that allows
* co-partitioning with partitionsRDD.
*/
override val partitioner =
partitionsRDD.partitioner.orElse(Some(Partitioner.defaultPartitioner(partitionsRDD)))
override def compute(split: Partition, context: TaskContext): Iterator[Edge[ED]] = {
val edgePartition = partitionsRDD.compute(split, context).next()._2
edgePartition.iterator
}
override def collect(): Array[Edge[ED]] = this.map(_.copy()).collect()
/**
* Caching a VertexRDD causes the index and values to be cached separately.
*/
override def persist(newLevel: StorageLevel): EdgeRDD[ED] = {
partitionsRDD.persist(newLevel)
this
}
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def persist(): EdgeRDD[ED] = persist(StorageLevel.MEMORY_ONLY)
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def cache(): EdgeRDD[ED] = persist()
def mapEdgePartitions[ED2: ClassManifest](f: EdgePartition[ED] => EdgePartition[ED2])
: EdgeRDD[ED2]= {
new EdgeRDD[ED2](partitionsRDD.mapPartitions({ iter =>
val (pid, ep) = iter.next()
Iterator(Tuple2(pid, f(ep)))
}, preservesPartitioning = true))
}
def zipEdgePartitions[T: ClassManifest, U: ClassManifest]
(other: RDD[T])
(f: (EdgePartition[ED], Iterator[T]) => Iterator[U]): RDD[U] = {
partitionsRDD.zipPartitions(other, preservesPartitioning = true) { (ePartIter, otherIter) =>
val (_, edgePartition) = ePartIter.next()
f(edgePartition, otherIter)
}
}
def collectVids(): RDD[Vid] = {
partitionsRDD.flatMap { case (_, p) => Array.concat(p.srcIds, p.dstIds) }
}
}


@@ -1,5 +1,7 @@
 package org.apache.spark.graph
+import org.apache.spark.graph.impl.VertexPartition
 /**
  * An edge triplet represents two vertices and edge along with their
  * attributes.
@@ -26,6 +28,9 @@ class EdgeTriplet[VD, ED] extends Edge[ED] {
   */
  var dstAttr: VD = _ //nullValue[VD]
+ var srcStale: Boolean = false
+ var dstStale: Boolean = false
  /**
   * Set the edge properties of this triplet.
   */
@@ -54,4 +59,5 @@ class EdgeTriplet[VD, ED] extends Edge[ED] {
  def vertexAttr(vid: Vid): VD =
    if (srcId == vid) srcAttr else { assert(dstId == vid); dstAttr }
+ override def toString() = ((srcId, srcAttr), (dstId, dstAttr), attr).toString()
 }


@@ -1,8 +1,10 @@
 package org.apache.spark.graph
+import org.apache.spark.graph.impl._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 /**
  * The Graph abstractly represents a graph with arbitrary objects
  * associated with vertices and edges. The graph provides basic
@@ -32,7 +34,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
   * @see Vertex for the vertex type.
   *
   */
- val vertices: VertexSetRDD[VD]
+ val vertices: VertexRDD[VD]
  /**
   * Get the Edges and their data as an RDD. The entries in the RDD
@@ -83,6 +85,11 @@
   */
  def cache(): Graph[VD, ED]
+ /**
+  * Repartition the edges in the graph according to partitionStrategy.
+  */
+ def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED]
  /**
   * Compute statistics describing the graph representation.
   */
@@ -162,7 +169,6 @@
   * Construct a new graph with all the edges reversed. If this graph
   * contains an edge from a to b then the returned graph contains an
   * edge from b to a.
-  *
   */
  def reverse: Graph[VD, ED]
@@ -200,18 +206,15 @@
  def mask[VD2: ClassManifest, ED2: ClassManifest](other: Graph[VD2, ED2]): Graph[VD, ED]
  /**
-  * This function merges multiple edges between two vertices into a
-  * single Edge. See
-  * [[org.apache.spark.graph.Graph.groupEdgeTriplets]] for more
-  * detail.
+  * This function merges multiple edges between two vertices into a single Edge. For correct
+  * results, the graph must have been partitioned using partitionBy.
   *
   * @tparam ED2 the type of the resulting edge data after grouping.
   *
-  * @param f the user supplied commutative associative function to merge
-  * edge attributes for duplicate edges.
+  * @param f the user supplied commutative associative function to merge edge attributes for
+  * duplicate edges.
   *
-  * @return Graph[VD,ED2] The resulting graph with a single Edge for
-  * each source, dest vertex pair.
+  * @return Graph[VD,ED2] The resulting graph with a single Edge for each source, dest vertex pair.
   */
  def groupEdges(merge: (ED, ED) => ED): Graph[VD,ED]
@@ -232,6 +235,11 @@
   * be commutative and assosciative and is used to combine the output
   * of the map phase.
   *
+  * @param activeSet optionally, a set of "active" vertices and a direction of edges to consider
+  * when running `mapFunc`. For example, if the direction is Out, `mapFunc` will only be run on
+  * edges originating from vertices in the active set. `activeSet` must have the same index as the
+  * graph's vertices.
+  *
   * @example We can use this function to compute the inDegree of each
   * vertex
   * {{{
@@ -249,8 +257,9 @@
   */
  def mapReduceTriplets[A: ClassManifest](
      mapFunc: EdgeTriplet[VD, ED] => Iterator[(Vid, A)],
-     reduceFunc: (A, A) => A)
-   : VertexSetRDD[A]
+     reduceFunc: (A, A) => A,
+     activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None)
+   : VertexRDD[A]
  /**
   * Join the vertices with an RDD and then apply a function from the
@@ -294,52 +303,30 @@
 /**
- * The Graph object contains a collection of routines used to
- * construct graphs from RDDs.
- *
+ * The Graph object contains a collection of routines used to construct graphs from RDDs.
  */
 object Graph {
-  import org.apache.spark.graph.impl._
-  import org.apache.spark.SparkContext._
  /**
   * Construct a graph from a collection of edges encoded as vertex id pairs.
   *
-  * @param rawEdges the RDD containing the set of edges in the graph
-  *
-  * @return a graph with edge attributes containing the count of duplicate edges.
-  */
-  def apply[VD: ClassManifest](rawEdges: RDD[(Vid, Vid)], defaultValue: VD): Graph[VD, Int] = {
-    Graph(rawEdges, defaultValue, false, RandomVertexCut())
-  }
-  /**
-   * Construct a graph from a collection of edges encoded as vertex id
-   * pairs.
-   *
   * @param rawEdges a collection of edges in (src,dst) form.
-  * @param uniqueEdges if multiple identical edges are found they are
-  * combined and the edge attribute is set to the sum. Otherwise
-  * duplicate edges are treated as separate.
-  *
-  * @return a graph with edge attributes containing either the count
-  * of duplicate edges or 1 (if `uniqueEdges=false`) and vertex
-  * attributes containing the total degree of each vertex.
+  * @param uniqueEdges if multiple identical edges are found they are combined and the edge
+  * attribute is set to the sum. Otherwise duplicate edges are treated as separate. To enable
+  * uniqueEdges, a [[PartitionStrategy]] must be provided.
   *
+  * @return a graph with edge attributes containing either the count of duplicate edges or 1
+  * (if `uniqueEdges=None`) and vertex attributes containing the total degree of each vertex.
   */
-  def apply[VD: ClassManifest](
+  def fromEdgeTuples[VD: ClassManifest](
      rawEdges: RDD[(Vid, Vid)],
      defaultValue: VD,
-     uniqueEdges: Boolean,
-     partitionStrategy: PartitionStrategy):
-    Graph[VD, Int] = {
+     uniqueEdges: Option[PartitionStrategy] = None): Graph[VD, Int] = {
    val edges = rawEdges.map(p => Edge(p._1, p._2, 1))
-   val graph = GraphImpl(edges, defaultValue, partitionStrategy)
-   if (uniqueEdges) {
-     graph.groupEdges((a,b) => a+b)
-   } else {
-     graph
+   val graph = GraphImpl(edges, defaultValue)
+   uniqueEdges match {
+     case Some(p) => graph.partitionBy(p).groupEdges((a, b) => a + b)
+     case None => graph
    }
  }
@@ -352,107 +339,40 @@
   * @return a graph with edge attributes described by `edges` and vertices
   * given by all vertices in `edges` with value `defaultValue`
   */
-  def apply[VD: ClassManifest, ED: ClassManifest](
+  def fromEdges[VD: ClassManifest, ED: ClassManifest](
      edges: RDD[Edge[ED]],
      defaultValue: VD): Graph[VD, ED] = {
-   Graph(edges, defaultValue, RandomVertexCut())
-  }
-  /**
-   * Construct a graph from a collection of edges.
-   *
-   * @param edges the RDD containing the set of edges in the graph
-   * @param defaultValue the default vertex attribute to use for each vertex
-   *
-   * @return a graph with edge attributes described by `edges` and vertices
-   * given by all vertices in `edges` with value `defaultValue`
-   */
-  def apply[VD: ClassManifest, ED: ClassManifest](
-     edges: RDD[Edge[ED]],
-     defaultValue: VD,
-     partitionStrategy: PartitionStrategy): Graph[VD, ED] = {
-   GraphImpl(edges, defaultValue, partitionStrategy)
+   GraphImpl(edges, defaultValue)
  }
  /**
   * Construct a graph from a collection attributed vertices and
-  * edges.
-  *
-  * @note Duplicate vertices are removed arbitrarily and missing
-  * vertices (vertices in the edge collection that are not in the
-  * vertex collection) are replaced by null vertex attributes.
-  *
-  * @tparam VD the vertex attribute type
-  * @tparam ED the edge attribute type
-  * @param vertices the "set" of vertices and their attributes
-  * @param edges the collection of edges in the graph
-  *
-  */
-  def apply[VD: ClassManifest, ED: ClassManifest](
-     vertices: RDD[(Vid,VD)],
-     edges: RDD[Edge[ED]]): Graph[VD, ED] = {
-   val defaultAttr: VD = null.asInstanceOf[VD]
-   Graph(vertices, edges, defaultAttr, (a:VD,b:VD) => a, RandomVertexCut())
-  }
-  /**
-  * Construct a graph from a collection attributed vertices and
-  * edges. Duplicate vertices are combined using the `mergeFunc` and
+  * edges. Duplicate vertices are picked arbitrarily and
   * vertices found in the edge collection but not in the input
-  * vertices are the default attribute `defautVertexAttr`.
-  *
-  * @note Duplicate vertices are removed arbitrarily .
+  * vertices are the default attribute.
   *
   * @tparam VD the vertex attribute type
   * @tparam ED the edge attribute type
   * @param vertices the "set" of vertices and their attributes
   * @param edges the collection of edges in the graph
   * @param defaultVertexAttr the default vertex attribute to use for
-  * vertices that are mentioned in `edges` but not in `vertices`
-  *
-  */
-  def apply[VD: ClassManifest, ED: ClassManifest](
-     vertices: RDD[(Vid,VD)],
-     edges: RDD[Edge[ED]],
-     defaultVertexAttr: VD): Graph[VD, ED] = {
-   Graph(vertices, edges, defaultVertexAttr, (a,b) => a, RandomVertexCut())
-  }
-  /**
-  * Construct a graph from a collection attributed vertices and
-  * edges. Duplicate vertices are combined using the `mergeFunc` and
-  * vertices found in the edge collection but not in the input
-  * vertices are the default attribute `defautVertexAttr`.
-  *
-  * @tparam VD the vertex attribute type
-  * @tparam ED the edge attribute type
-  * @param vertices the "set" of vertices and their attributes
-  * @param edges the collection of edges in the graph
-  * @param defaultVertexAttr the default vertex attribute to use for
-  * vertices that are mentioned in `edges` but not in `vertices
-  * @param mergeFunc the function used to merge duplicate vertices
-  * in the `vertices` collection.
+  * vertices that are mentioned in edges but not in vertices
   * @param partitionStrategy the partition strategy to use when
   * partitioning the edges.
-  *
   */
  def apply[VD: ClassManifest, ED: ClassManifest](
-     vertices: RDD[(Vid,VD)],
+     vertices: RDD[(Vid, VD)],
      edges: RDD[Edge[ED]],
-     defaultVertexAttr: VD,
-     mergeFunc: (VD, VD) => VD,
-     partitionStrategy: PartitionStrategy): Graph[VD, ED] = {
-   GraphImpl(vertices, edges, defaultVertexAttr, mergeFunc, partitionStrategy)
+     defaultVertexAttr: VD = null.asInstanceOf[VD]): Graph[VD, ED] = {
+   GraphImpl(vertices, edges, defaultVertexAttr)
  }
  /**
-  * The implicit graphToGraphOPs function extracts the GraphOps
-  * member from a graph.
+  * The implicit graphToGraphOPs function extracts the GraphOps member from a graph.
   *
-  * To improve modularity the Graph type only contains a small set of
-  * basic operations. All the convenience operations are defined in
-  * the GraphOps class which may be shared across multiple graph
-  * implementations.
+  * To improve modularity the Graph type only contains a small set of basic operations. All the
+  * convenience operations are defined in the GraphOps class which may be shared across multiple
+  * graph implementations.
   */
  implicit def graphToGraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) = g.ops
} // end of Graph object
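
The constructor surface is now much smaller: fromEdgeTuples, fromEdges, and a single apply that takes vertices plus edges. A hedged sketch of fromEdgeTuples with and without duplicate-edge merging; the toy edge list is made up.

    import org.apache.spark.SparkContext
    import org.apache.spark.graph._

    object GraphBuildSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "GraphBuildSketch")
        val rawEdges = sc.parallelize(Seq((1L, 2L), (1L, 2L), (2L, 3L)))
        // Keep duplicate edges as separate edges, each with attribute 1.
        val simple = Graph.fromEdgeTuples(rawEdges, defaultValue = 0)
        // Merge duplicates: requires a PartitionStrategy, and the repeated (1,2)
        // edge ends up as a single edge with attribute 2.
        val deduped = Graph.fromEdgeTuples(rawEdges, 0, uniqueEdges = Some(RandomVertexCut))
        println(simple.edges.count() + " vs " + deduped.edges.count())
        sc.stop()
      }
    }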


@@ -5,22 +5,22 @@ import com.esotericsoftware.kryo.Kryo
 import org.apache.spark.graph.impl._
 import org.apache.spark.serializer.KryoRegistrator
 import org.apache.spark.util.collection.BitSet
-import org.apache.spark.graph._
+import org.apache.spark.util.BoundedPriorityQueue
 class GraphKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo) {
    kryo.register(classOf[Edge[Object]])
-   kryo.register(classOf[MutableTuple2[Object, Object]])
    kryo.register(classOf[MessageToPartition[Object]])
    kryo.register(classOf[VertexBroadcastMsg[Object]])
-   kryo.register(classOf[AggregationMsg[Object]])
    kryo.register(classOf[(Vid, Object)])
    kryo.register(classOf[EdgePartition[Object]])
    kryo.register(classOf[BitSet])
    kryo.register(classOf[VertexIdToIndexMap])
    kryo.register(classOf[VertexAttributeBlock[Object]])
    kryo.register(classOf[PartitionStrategy])
+   kryo.register(classOf[BoundedPriorityQueue[Object]])
    // This avoids a large number of hash table lookups.
    kryo.setReferences(false)
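
To take advantage of the registrator, the driver has to point Spark's Kryo serializer at it. A hedged sketch using the system-property style of configuration this generation of Spark expects; the property names are the standard ones, adjust if your build differs.

    object KryoSetupSketch {
      def main(args: Array[String]) {
        System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
        val sc = new org.apache.spark.SparkContext("local", "KryoSetupSketch")
        // Build and process graphs as usual; graph messages and edge partitions
        // now go through Kryo with the classes registered above.
        sc.stop()
      }
    }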


@@ -24,6 +24,8 @@ object GraphLab {
   * @param scatterFunc Executed after the apply function the scatter function takes
   * a triplet and signals whether the neighboring vertex program
   * must be recomputed.
+  * @param startVertices predicate to determine which vertices to start the computation on.
+  * these will be the active vertices in the first iteration.
   * @param numIter The maximum number of iterations to run.
   * @param gatherDirection The direction of edges to consider during the gather phase
   * @param scatterDirection The direction of edges to consider during the scatter phase
@@ -40,12 +42,13 @@
    (gatherFunc: (Vid, EdgeTriplet[VD, ED]) => A,
     mergeFunc: (A, A) => A,
     applyFunc: (Vid, VD, Option[A]) => VD,
-    scatterFunc: (Vid, EdgeTriplet[VD, ED]) => Boolean): Graph[VD, ED] = {
+    scatterFunc: (Vid, EdgeTriplet[VD, ED]) => Boolean,
+    startVertices: (Vid, VD) => Boolean = (vid: Vid, data: VD) => true): Graph[VD, ED] = {
    // Add an active attribute to all vertices to track convergence.
    var activeGraph: Graph[(Boolean, VD), ED] = graph.mapVertices {
-     case (id, data) => (true, data)
+     case (id, data) => (startVertices(id, data), data)
    }.cache()
    // The gather function wrapper strips the active attribute and
@@ -86,9 +89,9 @@
    }
    // Used to set the active status of vertices for the next round
-   def applyActive(vid: Vid, data: (Boolean, VD), newActive: Boolean): (Boolean, VD) = {
+   def applyActive(vid: Vid, data: (Boolean, VD), newActiveOpt: Option[Boolean]): (Boolean, VD) = {
      val (prevActive, vData) = data
-     (newActive, vData)
+     (newActiveOpt.getOrElse(false), vData)
    }
    // Main Loop ---------------------------------------------------------------------
@@ -110,7 +113,7 @@
    val scattered: RDD[(Vid, Boolean)] =
      activeGraph.aggregateNeighbors(scatter, _ || _, scatterDirection.reverse)
-   activeGraph = activeGraph.joinVertices(scattered)(applyActive).cache()
+   activeGraph = activeGraph.outerJoinVertices(scattered)(applyActive).cache()
    // Calculate the number of active vertices
    numActive = activeGraph.vertices.map{
@@ -124,12 +127,3 @@
    activeGraph.mapVertices{case (vid, data) => data._2 }
  }
}


@@ -1,12 +1,13 @@
 package org.apache.spark.graph
-import org.apache.spark.rdd.RDD
-import org.apache.spark.SparkContext
-import org.apache.spark.SparkContext._
-import org.apache.spark.graph.impl.GraphImpl
+import java.util.{Arrays => JArrays}
+import org.apache.spark.graph.impl.EdgePartitionBuilder
+import org.apache.spark.{Logging, SparkContext}
+import org.apache.spark.graph.impl.{EdgePartition, GraphImpl}
+import org.apache.spark.util.collection.PrimitiveVector
-object GraphLoader {
+object GraphLoader extends Logging {
  /**
   * Load an edge list from file initializing the Graph
@@ -25,8 +26,7 @@
      sc: SparkContext,
      path: String,
      edgeParser: Array[String] => ED,
-     minEdgePartitions: Int = 1,
-     partitionStrategy: PartitionStrategy = RandomVertexCut()):
+     minEdgePartitions: Int = 1):
    Graph[Int, ED] = {
    // Parse the edge data table
    val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter =>
@@ -43,7 +43,7 @@
        Edge(source, target, edata)
      })
    val defaultVertexAttr = 1
-   Graph(edges, defaultVertexAttr, partitionStrategy)
+   Graph.fromEdges(edges, defaultVertexAttr)
  }
  /**
@@ -73,31 +73,39 @@
   * @tparam ED
   * @return
   */
- def edgeListFile[ED: ClassManifest](
+ def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
-     minEdgePartitions: Int = 1,
-     partitionStrategy: PartitionStrategy = RandomVertexCut()):
+     minEdgePartitions: Int = 1):
    Graph[Int, Int] = {
-   // Parse the edge data table
-   val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter =>
-     iter.filter(line => !line.isEmpty && line(0) != '#').map { line =>
+   val startTime = System.currentTimeMillis
+   // Parse the edge data table directly into edge partitions
+   val edges = sc.textFile(path, minEdgePartitions).mapPartitionsWithIndex { (pid, iter) =>
+     val builder = new EdgePartitionBuilder[Int]
+     iter.foreach { line =>
+       if (!line.isEmpty && line(0) != '#') {
        val lineArray = line.split("\\s+")
-       if(lineArray.length < 2) {
-         println("Invalid line: " + line)
-         assert(false)
+       if (lineArray.length < 2) {
+         logWarning("Invalid line: " + line)
        }
-       val source = lineArray(0).trim.toLong
-       val target = lineArray(1).trim.toLong
-       if (canonicalOrientation && target > source) {
-         Edge(target, source, 1)
+       val srcId = lineArray(0).toLong
+       val dstId = lineArray(1).toLong
+       if (canonicalOrientation && dstId > srcId) {
+         builder.add(dstId, srcId, 1)
        } else {
-         Edge(source, target, 1)
+         builder.add(srcId, dstId, 1)
        }
-     })
-   val defaultVertexAttr = 1
-   Graph(edges, defaultVertexAttr, partitionStrategy)
+       }
+     }
+     Iterator((pid, builder.toEdgePartition))
+   }.cache()
+   edges.count()
+   logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))
+   GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1)
  } // end of edgeListFile
}
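
edgeListFile expects whitespace-separated "src dst" pairs and skips lines starting with #, building edge partitions directly as it parses. A hedged usage sketch; the file name is a placeholder.

    import org.apache.spark.SparkContext
    import org.apache.spark.graph._

    object EdgeListLoadSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "EdgeListLoadSketch")
        // followers.txt might contain:
        //   # comment
        //   1 2
        //   2 3
        // canonicalOrientation = true folds each edge into a single canonical direction,
        // which TriangleCount relies on.
        val graph = GraphLoader.edgeListFile(sc, "followers.txt",
          canonicalOrientation = true, minEdgePartitions = 2).cache()
        println("edges: " + graph.edges.count + ", vertices: " + graph.vertices.count)
        sc.stop()
      }
    }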


@@ -2,7 +2,6 @@ package org.apache.spark.graph
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext._
-import org.apache.spark.util.ClosureCleaner
 import org.apache.spark.SparkException
@@ -35,14 +34,14 @@ class GraphOps[VD: ClassManifest, ED: ClassManifest](graph: Graph[VD, ED]) {
   * RDD.
   * @note Vertices with no in edges are not returned in the resulting RDD.
   */
- lazy val inDegrees: VertexSetRDD[Int] = degreesRDD(EdgeDirection.In)
+ lazy val inDegrees: VertexRDD[Int] = degreesRDD(EdgeDirection.In)
  /**
   * Compute the out-degree of each vertex in the Graph returning an RDD.
   * @note Vertices with no out edges are not returned in the resulting RDD.
   */
- lazy val outDegrees: VertexSetRDD[Int] = degreesRDD(EdgeDirection.Out)
+ lazy val outDegrees: VertexRDD[Int] = degreesRDD(EdgeDirection.Out)
  /**
@@ -50,7 +49,7 @@
   * @note Vertices with no edges are not returned in the resulting
   * RDD.
   */
- lazy val degrees: VertexSetRDD[Int] = degreesRDD(EdgeDirection.Both)
+ lazy val degrees: VertexRDD[Int] = degreesRDD(EdgeDirection.Both)
  /**
@@ -59,7 +58,7 @@
   * @param edgeDirection the direction along which to collect
   * neighboring vertex attributes.
   */
- private def degreesRDD(edgeDirection: EdgeDirection): VertexSetRDD[Int] = {
+ private def degreesRDD(edgeDirection: EdgeDirection): VertexRDD[Int] = {
    if (edgeDirection == EdgeDirection.In) {
      graph.mapReduceTriplets(et => Iterator((et.dstId,1)), _ + _)
    } else if (edgeDirection == EdgeDirection.Out) {
@@ -114,10 +113,7 @@
      mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
      reduceFunc: (A, A) => A,
      dir: EdgeDirection)
-   : VertexSetRDD[A] = {
-   ClosureCleaner.clean(mapFunc)
-   ClosureCleaner.clean(reduceFunc)
+   : VertexRDD[A] = {
    // Define a new map function over edge triplets
    val mf = (et: EdgeTriplet[VD,ED]) => {
@@ -140,7 +136,6 @@
    }
    }
-   ClosureCleaner.clean(mf)
    graph.mapReduceTriplets(mf, reduceFunc)
  } // end of aggregateNeighbors
@@ -154,7 +149,7 @@
   * @return the vertex set of neighboring ids for each vertex.
   */
  def collectNeighborIds(edgeDirection: EdgeDirection) :
-   VertexSetRDD[Array[Vid]] = {
+   VertexRDD[Array[Vid]] = {
    val nbrs =
      if (edgeDirection == EdgeDirection.Both) {
        graph.mapReduceTriplets[Array[Vid]](
@@ -190,7 +185,7 @@
   * vertex.
   */
  def collectNeighbors(edgeDirection: EdgeDirection) :
-   VertexSetRDD[ Array[(Vid, VD)] ] = {
+   VertexRDD[ Array[(Vid, VD)] ] = {
    val nbrs = graph.aggregateNeighbors[Array[(Vid,VD)]](
      (vid, edge) =>
        Some(Array( (edge.otherVertexId(vid), edge.otherVertexAttr(vid)) )),
@@ -233,14 +228,12 @@
   */
  def joinVertices[U: ClassManifest](table: RDD[(Vid, U)])(mapFunc: (Vid, VD, U) => VD)
    : Graph[VD, ED] = {
-   ClosureCleaner.clean(mapFunc)
    val uf = (id: Vid, data: VD, o: Option[U]) => {
      o match {
        case Some(u) => mapFunc(id, data, u)
        case None => data
      }
    }
-   ClosureCleaner.clean(uf)
    graph.outerJoinVertices(table)(uf)
  }
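
The degree helpers are thin wrappers over mapReduceTriplets, and they now return VertexRDDs. A hedged sketch computing in-degrees both ways, assuming the VertexRDD can be consumed as an RDD of (id, value) pairs the way the driver code elsewhere in this commit uses it; the toy edges are made up.

    import org.apache.spark.SparkContext
    import org.apache.spark.graph._

    object InDegreeSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "InDegreeSketch")
        val graph = Graph.fromEdgeTuples(
          sc.parallelize(Seq((1L, 2L), (3L, 2L), (2L, 1L))), defaultValue = 0)
        // Convenience wrapper from GraphOps, reached through the implicit graphToGraphOps.
        graph.inDegrees.collect().foreach(println)   // (2,2) and (1,1); vertex 3 is absent
        // The same computation spelled out with mapReduceTriplets.
        graph.mapReduceTriplets[Int](et => Iterator((et.dstId, 1)), _ + _)
          .collect().foreach(println)
        sc.stop()
      }
    }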


@@ -50,7 +50,7 @@ sealed trait PartitionStrategy extends Serializable {
 *
 *
 */
-case class EdgePartition2D() extends PartitionStrategy {
+case object EdgePartition2D extends PartitionStrategy {
  override def getPartition(src: Vid, dst: Vid, numParts: Pid): Pid = {
    val ceilSqrtNumParts: Pid = math.ceil(math.sqrt(numParts)).toInt
    val mixingPrime: Vid = 1125899906842597L
@@ -61,7 +61,7 @@ case class EdgePartition2D() extends PartitionStrategy {
 }
-case class EdgePartition1D() extends PartitionStrategy {
+case object EdgePartition1D extends PartitionStrategy {
  override def getPartition(src: Vid, dst: Vid, numParts: Pid): Pid = {
    val mixingPrime: Vid = 1125899906842597L
    (math.abs(src) * mixingPrime).toInt % numParts
@@ -73,7 +73,7 @@ case class EdgePartition1D() extends PartitionStrategy {
 * Assign edges to an aribtrary machine corresponding to a
 * random vertex cut.
 */
-case class RandomVertexCut() extends PartitionStrategy {
+case object RandomVertexCut extends PartitionStrategy {
  override def getPartition(src: Vid, dst: Vid, numParts: Pid): Pid = {
    math.abs((src, dst).hashCode()) % numParts
  }
@@ -85,7 +85,7 @@ case class RandomVertexCut() extends PartitionStrategy {
 * function ensures that edges of opposite direction between the same two vertices
 * will end up on the same partition.
 */
-case class CanonicalRandomVertexCut() extends PartitionStrategy {
+case object CanonicalRandomVertexCut extends PartitionStrategy {
  override def getPartition(src: Vid, dst: Vid, numParts: Pid): Pid = {
    val lower = math.min(src, dst)
    val higher = math.max(src, dst)
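
Because the strategies are now case objects they can be used directly, with no () construction. A small sketch calling getPartition on a few of them; the vertex ids and partition count are arbitrary.

    import org.apache.spark.graph._

    object PartitionStrategySketch {
      def main(args: Array[String]) {
        val numParts = 9
        // Each strategy maps a (src, dst) pair to a partition id in [0, numParts).
        println(RandomVertexCut.getPartition(1L, 2L, numParts))
        println(EdgePartition2D.getPartition(1L, 2L, numParts))
        // The canonical cut ignores direction, so both orderings land on the same partition.
        println(CanonicalRandomVertexCut.getPartition(1L, 2L, numParts))
        println(CanonicalRandomVertexCut.getPartition(2L, 1L, numParts))
      }
    }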


@ -1,7 +1,5 @@
package org.apache.spark.graph package org.apache.spark.graph
import org.apache.spark.rdd.RDD
/** /**
* This object implements a Pregel-like bulk-synchronous * This object implements a Pregel-like bulk-synchronous
@ -13,10 +11,6 @@ import org.apache.spark.rdd.RDD
* execution while also exposing greater flexibility for graph based * execution while also exposing greater flexibility for graph based
* computation. * computation.
* *
* This object present several variants of the bulk synchronous
* execution that differ only in the edge direction along which
* messages are sent and whether a fixed number of iterations is used.
*
* @example We can use the Pregel abstraction to implement PageRank * @example We can use the Pregel abstraction to implement PageRank
* {{{ * {{{
* val pagerankGraph: Graph[Double, Double] = graph * val pagerankGraph: Graph[Double, Double] = graph
@ -43,7 +37,6 @@ import org.apache.spark.rdd.RDD
*/ */
object Pregel { object Pregel {
/** /**
* Execute a Pregel-like iterative vertex-parallel abstraction. The * Execute a Pregel-like iterative vertex-parallel abstraction. The
* user-defined vertex-program `vprog` is executed in parallel on * user-defined vertex-program `vprog` is executed in parallel on
@ -58,7 +51,8 @@ object Pregel {
* on subsequent iterations if a vertex does not receive a message * on subsequent iterations if a vertex does not receive a message
* then the vertex-program is not invoked. * then the vertex-program is not invoked.
* *
* This function iterates a fixed number (`numIter`) of iterations. * This function iterates until there are no remaining messages, or
* for maxIterations iterations.
* *
* @tparam VD the vertex data type * @tparam VD the vertex data type
* @tparam ED the edge data type * @tparam ED the edge data type
@ -69,7 +63,7 @@ object Pregel {
* @param initialMsg the message each vertex will receive on * @param initialMsg the message each vertex will receive on
* the first iteration. * the first iteration.
* *
* @param numIter the number of iterations to run this computation. * @param maxIterations the maximum number of iterations to run for.
* *
* @param vprog the user-defined vertex program which runs on each * @param vprog the user-defined vertex program which runs on each
* vertex and receives the inbound message and computes a new vertex * vertex and receives the inbound message and computes a new vertex
@ -91,123 +85,36 @@ object Pregel {
* *
*/ */
def apply[VD: ClassManifest, ED: ClassManifest, A: ClassManifest] def apply[VD: ClassManifest, ED: ClassManifest, A: ClassManifest]
(graph: Graph[VD, ED], initialMsg: A, numIter: Int)( (graph: Graph[VD, ED], initialMsg: A, maxIterations: Int = Int.MaxValue)(
vprog: (Vid, VD, A) => VD, vprog: (Vid, VD, A) => VD,
sendMsg: EdgeTriplet[VD, ED] => Iterator[(Vid,A)], sendMsg: EdgeTriplet[VD, ED] => Iterator[(Vid,A)],
mergeMsg: (A, A) => A) mergeMsg: (A, A) => A)
: Graph[VD, ED] = { : Graph[VD, ED] = {
// Receive the first set of messages var g = graph.mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg) )
var g = graph.mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg)).cache
var i = 0
while (i < numIter) {
// compute the messages // compute the messages
val messages = g.mapReduceTriplets(sendMsg, mergeMsg) var messages = g.mapReduceTriplets(sendMsg, mergeMsg).cache()
// receive the messages var activeMessages = messages.count()
g = g.joinVertices(messages)(vprog).cache
// count the iteration
i += 1
}
// Return the final graph
g
} // end of apply
/**
* Execute a Pregel-like iterative vertex-parallel abstraction. The
* user-defined vertex-program `vprog` is executed in parallel on
* each vertex receiving any inbound messages and computing a new
* value for the vertex. The `sendMsg` function is then invoked on
* all out-edges and is used to compute an optional message to the
* destination vertex. The `mergeMsg` function is a commutative
* associative function used to combine messages destined to the
* same vertex.
*
* On the first iteration all vertices receive the `initialMsg` and
* on subsequent iterations if a vertex does not receive a message
* then the vertex-program is not invoked.
*
* This function iterates until there are no remaining messages.
*
* @tparam VD the vertex data type
* @tparam ED the edge data type
* @tparam A the Pregel message type
*
* @param graph the input graph.
*
* @param initialMsg the message each vertex will receive at the on
* the first iteration.
*
* @param numIter the number of iterations to run this computation.
*
* @param vprog the user-defined vertex program which runs on each
* vertex and receives the inbound message and computes a new vertex
* value. On the first iteration the vertex program is invoked on
* all vertices and is passed the default message. On subsequent
* iterations the vertex program is only invoked on those vertices
* that receive messages.
*
* @param sendMsg a user supplied function that is applied to out
* edges of vertices that received messages in the current
* iteration.
*
* @param mergeMsg a user supplied function that takes two incoming
* messages of type A and merges them into a single message of type
* A. ''This function must be commutative and associative and
* ideally the size of A should not increase.''
*
* @return the resulting graph at the end of the computation
*
*/
def apply[VD: ClassManifest, ED: ClassManifest, A: ClassManifest]
(graph: Graph[VD, ED], initialMsg: A)(
vprog: (Vid, VD, A) => VD,
sendMsg: EdgeTriplet[VD, ED] => Iterator[(Vid,A)],
mergeMsg: (A, A) => A)
: Graph[VD, ED] = {
def vprogFun(id: Vid, attr: (VD, Boolean), msgOpt: Option[A]): (VD, Boolean) = {
msgOpt match {
case Some(msg) => (vprog(id, attr._1, msg), true)
case None => (attr._1, false)
}
}
def sendMsgFun(edge: EdgeTriplet[(VD,Boolean), ED]): Iterator[(Vid, A)] = {
if(edge.srcAttr._2) {
val et = new EdgeTriplet[VD, ED]
et.srcId = edge.srcId
et.srcAttr = edge.srcAttr._1
et.dstId = edge.dstId
et.dstAttr = edge.dstAttr._1
et.attr = edge.attr
sendMsg(et)
} else {
Iterator.empty
}
}
var g = graph.mapVertices( (vid, vdata) => (vprog(vid, vdata, initialMsg), true) )
// compute the messages
var messages = g.mapReduceTriplets(sendMsgFun, mergeMsg).cache
var activeMessages = messages.count
// Loop // Loop
var i = 0 var i = 0
while (activeMessages > 0) { while (activeMessages > 0 && i < maxIterations) {
// receive the messages // Receive the messages. Vertices that didn't get any messages do not appear in newVerts.
g = g.outerJoinVertices(messages)(vprogFun) val newVerts = g.vertices.innerJoin(messages)(vprog).cache()
// Update the graph with the new vertices.
g = g.outerJoinVertices(newVerts) { (vid, old, newOpt) => newOpt.getOrElse(old) }
val oldMessages = messages val oldMessages = messages
// compute the messages // Send new messages. Vertices that didn't get any messages don't appear in newVerts, so don't
messages = g.mapReduceTriplets(sendMsgFun, mergeMsg).cache // get to send messages.
activeMessages = messages.count messages = g.mapReduceTriplets(sendMsg, mergeMsg, Some((newVerts, EdgeDirection.Out))).cache()
activeMessages = messages.count()
// after counting we can unpersist the old messages // after counting we can unpersist the old messages
oldMessages.unpersist(blocking=false) oldMessages.unpersist(blocking=false)
// count the iteration // count the iteration
i += 1 i += 1
} }
// Return the final graph
g.mapVertices((id, attr) => attr._1) g
} // end of apply } // end of apply
} // end of class Pregel } // end of class Pregel
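A minimal usage sketch of the single remaining apply (not part of this patch): single-source shortest paths, assuming an existing Graph[Double, Double] named initialGraph whose source vertex attribute was initialised to 0.0 and all other vertices to Double.PositiveInfinity.
{{{
// Hypothetical SSSP on top of the new Pregel API; `initialGraph` is assumed as described above.
val sssp = Pregel(initialGraph, Double.PositiveInfinity)(
  (id, dist, newDist) => math.min(dist, newDist),        // vprog: keep the shortest known distance
  triplet => {                                           // sendMsg: relax out-edges of updated vertices
    if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
      Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
    } else {
      Iterator.empty
    }
  },
  (a, b) => math.min(a, b))                              // mergeMsg: take the smaller candidate
}}}
The loop terminates once no edge relaxation produces a message, or after maxIterations rounds if that bound is supplied.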

View file

@ -0,0 +1,378 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.graph
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.graph.impl.MsgRDDFunctions
import org.apache.spark.graph.impl.VertexPartition
/**
* A `VertexRDD[VD]` extends the `RDD[(Vid, VD)]` by ensuring that there is
* only one entry for each vertex and by pre-indexing the entries for fast,
* efficient joins.
*
* @tparam VD the vertex attribute associated with each vertex in the set.
*
* To construct a `VertexRDD` use the singleton object:
*
* @example Construct a `VertexRDD` from a plain RDD
* {{{
* // Construct an initial vertex set
* val someData: RDD[(Vid, SomeType)] = loadData(someFile)
* val vset = VertexRDD(someData)
* // If there were redundant values in someData we would use a reduceFunc
* val vset2 = VertexRDD(someData, reduceFunc)
* // Finally we can use the VertexRDD to index another dataset
* val otherData: RDD[(Vid, OtherType)] = loadData(otherFile)
* val vset3 = VertexRDD(otherData, vset.index)
* // Now we can construct very fast joins between the two sets
* val vset4: VertexRDD[(SomeType, OtherType)] = vset.leftJoin(vset3)
* }}}
*
*/
class VertexRDD[@specialized VD: ClassManifest](
val partitionsRDD: RDD[VertexPartition[VD]])
extends RDD[(Vid, VD)](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {
require(partitionsRDD.partitioner.isDefined)
partitionsRDD.setName("VertexRDD")
/**
* Construct a new VertexRDD that is indexed by only the keys in the RDD.
* The resulting VertexRDD will be based on a different index and can
* no longer be quickly joined with this RDD.
*/
def reindex(): VertexRDD[VD] = new VertexRDD(partitionsRDD.map(_.reindex()))
/**
* The partitioner is defined by the index.
*/
override val partitioner = partitionsRDD.partitioner
/**
* The actual partitions are defined by the tuples.
*/
override protected def getPartitions: Array[Partition] = partitionsRDD.partitions
/**
* The preferred locations are computed based on the preferred
* locations of the tuples.
*/
override protected def getPreferredLocations(s: Partition): Seq[String] =
partitionsRDD.preferredLocations(s)
/**
* Caching a VertexRDD causes the index and values to be cached separately.
*/
override def persist(newLevel: StorageLevel): VertexRDD[VD] = {
partitionsRDD.persist(newLevel)
this
}
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def persist(): VertexRDD[VD] = persist(StorageLevel.MEMORY_ONLY)
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def cache(): VertexRDD[VD] = persist()
/** Return the number of vertices in this set. */
override def count(): Long = {
partitionsRDD.map(_.size).reduce(_ + _)
}
/**
* Provide the `RDD[(Vid, VD)]` equivalent output.
*/
override def compute(part: Partition, context: TaskContext): Iterator[(Vid, VD)] = {
partitionsRDD.compute(part, context).next().iterator
}
/**
* Return a new VertexRDD by applying a function to each VertexPartition of this RDD.
*/
def mapVertexPartitions[VD2: ClassManifest](f: VertexPartition[VD] => VertexPartition[VD2])
: VertexRDD[VD2] = {
val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true)
new VertexRDD(newPartitionsRDD)
}
/**
* Return a new VertexRDD by applying a function to corresponding
* VertexPartitions of this VertexRDD and another one.
*/
def zipVertexPartitions[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexRDD[VD2])
(f: (VertexPartition[VD], VertexPartition[VD2]) => VertexPartition[VD3]): VertexRDD[VD3] = {
val newPartitionsRDD = partitionsRDD.zipPartitions(
other.partitionsRDD, preservesPartitioning = true
) { (thisIter, otherIter) =>
val thisPart = thisIter.next()
val otherPart = otherIter.next()
Iterator(f(thisPart, otherPart))
}
new VertexRDD(newPartitionsRDD)
}
/**
* Restrict the vertex set to the set of vertices satisfying the
* given predicate.
*
* @param pred the user defined predicate, which takes a tuple to conform to
* the RDD[(Vid, VD)] interface
*
* @note The vertex set preserves the original index structure
* which means that the returned RDD can be easily joined with
* the original vertex-set. Furthermore, the filter only
* modifies the bitmap index and so no new values are allocated.
*/
override def filter(pred: Tuple2[Vid, VD] => Boolean): VertexRDD[VD] =
this.mapVertexPartitions(_.filter(Function.untupled(pred)))
/**
* Pass each vertex attribute through a map function and retain the
* original RDD's partitioning and index.
*
* @tparam VD2 the type returned by the map function
*
* @param f the function applied to each value in the RDD
* @return a new VertexRDD with values obtained by applying `f` to
* each of the entries in the original VertexRDD. The resulting
* VertexRDD retains the same index.
*/
def mapValues[VD2: ClassManifest](f: VD => VD2): VertexRDD[VD2] =
this.mapVertexPartitions(_.map((vid, attr) => f(attr)))
/**
* Pass each vertex attribute through a map function and retain the
* original RDD's partitioning and index.
*
* @tparam VD2 the type returned by the map function
*
* @param f the function applied to each value in the RDD
* @return a new VertexRDD with values obtained by applying `f` to
* each of the entries in the original VertexRDD. The resulting
* VertexRDD retains the same index.
*/
def mapValues[VD2: ClassManifest](f: (Vid, VD) => VD2): VertexRDD[VD2] =
this.mapVertexPartitions(_.map(f))
/**
* Hides vertices that are the same between this and other. For vertices that are different, keeps
* the values from `other`.
*/
def diff(other: VertexRDD[VD]): VertexRDD[VD] = {
this.zipVertexPartitions(other) { (thisPart, otherPart) =>
thisPart.diff(otherPart)
}
}
/**
* Inner join this VertexSet with another VertexSet which has the
* same index. This function will fail if the two VertexSets do not
* share the same index. The resulting vertex set will only contain
* vertices that are in both this and the other vertex set.
*
* @tparam VD2 the attribute type of the other VertexSet
* @tparam VD3 the attribute type of the resulting VertexSet
*
* @param other the other VertexSet with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @return a VertexRDD containing only the vertices in both this
* and the other VertexSet and with tuple attributes.
*/
def zipJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexRDD[VD2])(f: (Vid, VD, VD2) => VD3): VertexRDD[VD3] = {
this.zipVertexPartitions(other) { (thisPart, otherPart) =>
thisPart.join(otherPart)(f)
}
}
/**
* Left join this VertexSet with another VertexSet which has the
* same index. This function will fail if the two VertexSets do not
* share the same index. The resulting vertex set contains an entry
* for each vertex in this set. If the other VertexSet is missing
* any vertex in this VertexSet then a `None` attribute is generated
*
* @tparam VD2 the attribute type of the other VertexSet
* @tparam VD3 the attribute type of the resulting VertexSet
*
* @param other the other VertexSet with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @return a VertexRDD containing all the vertices in this
* VertexSet with `None` attributes used for Vertices missing in the
* other VertexSet.
*
*/
def leftZipJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexRDD[VD2])(f: (Vid, VD, Option[VD2]) => VD3): VertexRDD[VD3] = {
this.zipVertexPartitions(other) { (thisPart, otherPart) =>
thisPart.leftJoin(otherPart)(f)
}
}
/**
* Left join this VertexRDD with an RDD containing vertex attribute
* pairs. If the other RDD is backed by a VertexRDD with the same
* index, then the efficient leftZipJoin implementation is used. The
* resulting vertex set contains an entry for each vertex in this
* set. If the other VertexRDD is missing any vertex in this
* VertexRDD then a `None` attribute is generated.
*
* If there are duplicates, the vertex is picked at random.
*
* @tparam VD2 the attribute type of the other VertexRDD
* @tparam VD3 the attribute type of the resulting VertexRDD
*
* @param other the other VertexRDD with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @return a VertexRDD containing all the vertices in this
* VertexRDD with the attribute emitted by f.
*/
def leftJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: RDD[(Vid, VD2)])
(f: (Vid, VD, Option[VD2]) => VD3)
: VertexRDD[VD3] =
{
// Test if the other RDD is a VertexRDD to choose the optimal join strategy.
// If the other set is a VertexRDD then we use the much more efficient leftZipJoin
other match {
case other: VertexRDD[_] =>
leftZipJoin(other)(f)
case _ =>
new VertexRDD[VD3](
partitionsRDD.zipPartitions(
other.partitionBy(this.partitioner.get), preservesPartitioning = true)
{ (part, msgs) =>
val vertexPartition: VertexPartition[VD] = part.next()
Iterator(vertexPartition.leftJoin(msgs)(f))
}
)
}
}
/**
* Same effect as leftJoin(other) { (vid, a, bOpt) => bOpt.getOrElse(a) }, but `this` and `other`
* must have the same index.
*/
def innerZipJoin[U: ClassManifest, VD2: ClassManifest](other: VertexRDD[U])
(f: (Vid, VD, U) => VD2): VertexRDD[VD2] = {
this.zipVertexPartitions(other) { (thisPart, otherPart) =>
thisPart.innerJoin(otherPart)(f)
}
}
/**
* Replace vertices with corresponding vertices in `other`, and drop vertices without a
* corresponding vertex in `other`.
*/
def innerJoin[U: ClassManifest, VD2: ClassManifest](other: RDD[(Vid, U)])
(f: (Vid, VD, U) => VD2): VertexRDD[VD2] = {
// Test if the other RDD is a VertexRDD to choose the optimal join strategy.
// If the other set is a VertexRDD then we use the much more efficient innerZipJoin
other match {
case other: VertexRDD[_] =>
innerZipJoin(other)(f)
case _ =>
new VertexRDD(
partitionsRDD.zipPartitions(
other.partitionBy(this.partitioner.get), preservesPartitioning = true)
{ (part, msgs) =>
val vertexPartition: VertexPartition[VD] = part.next()
Iterator(vertexPartition.innerJoin(msgs)(f))
}
)
}
}
def aggregateUsingIndex[VD2: ClassManifest](
messages: RDD[(Vid, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] =
{
val shuffled = MsgRDDFunctions.partitionForAggregation(messages, this.partitioner.get)
val parts = partitionsRDD.zipPartitions(shuffled, true) { (thisIter, msgIter) =>
val vertexPartition: VertexPartition[VD] = thisIter.next()
Iterator(vertexPartition.aggregateUsingIndex(msgIter, reduceFunc))
}
new VertexRDD[VD2](parts)
}
} // end of VertexRDD
/**
* The VertexRDD singleton is used to construct VertexRDDs
*/
object VertexRDD {
/**
* Construct a vertex set from an RDD of vertex-attribute pairs.
* Duplicate entries are removed arbitrarily.
*
* @tparam VD the vertex attribute type
*
* @param rdd the collection of vertex-attribute pairs
*/
def apply[VD: ClassManifest](rdd: RDD[(Vid, VD)]): VertexRDD[VD] = {
val partitioned: RDD[(Vid, VD)] = rdd.partitioner match {
case Some(p) => rdd
case None => rdd.partitionBy(new HashPartitioner(rdd.partitions.size))
}
val vertexPartitions = partitioned.mapPartitions(
iter => Iterator(VertexPartition(iter)),
preservesPartitioning = true)
new VertexRDD(vertexPartitions)
}
/**
* Construct a vertex set from an RDD of vertex-attribute pairs.
* Duplicate entries are merged using mergeFunc.
*
* @tparam VD the vertex attribute type
*
* @param rdd the collection of vertex-attribute pairs
* @param mergeFunc the associative, commutative merge function.
*/
def apply[VD: ClassManifest](rdd: RDD[(Vid, VD)], mergeFunc: (VD, VD) => VD): VertexRDD[VD] =
{
val partitioned: RDD[(Vid, VD)] = rdd.partitioner match {
case Some(p) => rdd
case None => rdd.partitionBy(new HashPartitioner(rdd.partitions.size))
}
val vertexPartitions = partitioned.mapPartitions(
iter => Iterator(VertexPartition(iter)),
preservesPartitioning = true)
new VertexRDD(vertexPartitions)
}
def apply[VD: ClassManifest](vids: RDD[Vid], rdd: RDD[(Vid, VD)], defaultVal: VD)
: VertexRDD[VD] =
{
VertexRDD(vids.map(vid => (vid, defaultVal))).leftJoin(rdd) { (vid, default, value) =>
value.getOrElse(default)
}
}
}
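A small usage sketch for the join operators above (illustrative only; it assumes an existing SparkContext named sc, and the ids and values are made up):
{{{
import org.apache.spark.graph._

val ranks = VertexRDD(sc.parallelize(Seq((1L, 0.15), (2L, 0.30), (3L, 0.45))))
val degrees = sc.parallelize(Seq((1L, 2), (3L, 1)))
// leftJoin keeps every vertex in `ranks`; vertices missing from `degrees` see None.
val withDegree = ranks.leftJoin(degrees) { (vid, rank, degOpt) => (rank, degOpt.getOrElse(0)) }
// innerJoin keeps only the vertices present on both sides.
val normalized = ranks.innerJoin(degrees) { (vid, rank, deg) => rank / deg }
}}}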

View file

@ -1,593 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.graph
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.collection.{BitSet, OpenHashSet, PrimitiveKeyOpenHashMap}
import org.apache.spark.graph.impl.AggregationMsg
import org.apache.spark.graph.impl.MsgRDDFunctions._
import org.apache.spark.graph.impl.VertexPartition
/**
* Maintains the per-partition mapping from vertex id to the corresponding
* location in the per-partition values array. This class is meant to be an
* opaque type.
*
*/
class VertexSetIndex(private[spark] val rdd: RDD[VertexIdToIndexMap]) {
/**
* The persist function behaves like the standard RDD persist
*/
def persist(newLevel: StorageLevel): VertexSetIndex = {
rdd.persist(newLevel)
return this
}
/**
* Returns the partitioner object of the underlying RDD. This is
* used by the VertexSetRDD to partition the values RDD.
*/
def partitioner: Partitioner = rdd.partitioner.get
} // end of VertexSetIndex
/**
* A `VertexSetRDD[VD]` extends the `RDD[(Vid, VD)]` by ensuring that there is
* only one entry for each vertex and by pre-indexing the entries for fast,
* efficient joins.
*
* @tparam VD the vertex attribute associated with each vertex in the set.
*
* To construct a `VertexSetRDD` use the singleton object:
*
* @example Construct a `VertexSetRDD` from a plain RDD
* {{{
* // Construct an intial vertex set
* val someData: RDD[(Vid, SomeType)] = loadData(someFile)
* val vset = VertexSetRDD(someData)
* // If there were redundant values in someData we would use a reduceFunc
* val vset2 = VertexSetRDD(someData, reduceFunc)
* // Finally we can use the VertexSetRDD to index another dataset
* val otherData: RDD[(Vid, OtherType)] = loadData(otherFile)
* val vset3 = VertexSetRDD(otherData, vset.index)
* // Now we can construct very fast joins between the two sets
* val vset4: VertexSetRDD[(SomeType, OtherType)] = vset.leftJoin(vset3)
* }}}
*
*/
class VertexSetRDD[@specialized VD: ClassManifest](
@transient val partitionsRDD: RDD[VertexPartition[VD]])
extends RDD[(Vid, VD)](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {
/**
* The `VertexSetIndex` representing the layout of this `VertexSetRDD`.
*/
def index = new VertexSetIndex(partitionsRDD.mapPartitions(_.map(_.index),
preservesPartitioning = true))
/**
* Construct a new VertexSetRDD that is indexed by only the keys in the RDD.
* The resulting VertexSet will be based on a different index and can
* no longer be quickly joined with this RDD.
*/
def reindex(): VertexSetRDD[VD] = VertexSetRDD(this)
/**
* An internal representation which joins the block indices with the values
* This is used by the compute function to emulate `RDD[(Vid, VD)]`
*/
protected[spark] val tuples = partitionsRDD.flatMap(_.iterator)
/**
* The partitioner is defined by the index.
*/
override val partitioner = partitionsRDD.partitioner
/**
* The actual partitions are defined by the tuples.
*/
override def getPartitions: Array[Partition] = tuples.partitions
/**
* The preferred locations are computed based on the preferred
* locations of the tuples.
*/
override def getPreferredLocations(s: Partition): Seq[String] =
tuples.preferredLocations(s)
/**
* Caching a VertexSetRDD causes the index and values to be cached separately.
*/
override def persist(newLevel: StorageLevel): VertexSetRDD[VD] = {
partitionsRDD.persist(newLevel)
this
}
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def persist(): VertexSetRDD[VD] = persist(StorageLevel.MEMORY_ONLY)
/** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
override def cache(): VertexSetRDD[VD] = persist()
/**
* Provide the `RDD[(Vid, VD)]` equivalent output.
*/
override def compute(part: Partition, context: TaskContext): Iterator[(Vid, VD)] =
tuples.compute(part, context)
/**
* Return a new VertexSetRDD by applying a function to each VertexPartition of
* this RDD.
*/
def mapVertexPartitions[VD2: ClassManifest](
f: VertexPartition[VD] => VertexPartition[VD2]): VertexSetRDD[VD2] = {
val cleanF = sparkContext.clean(f)
val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true)
new VertexSetRDD(newPartitionsRDD)
}
/**
* Return a new VertexSetRDD by applying a function to corresponding
* VertexPartitions of this VertexSetRDD and another one.
*/
def zipVertexPartitions[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexSetRDD[VD2])
(f: (VertexPartition[VD], VertexPartition[VD2]) => VertexPartition[VD3]): VertexSetRDD[VD3] = {
val cleanF = sparkContext.clean(f)
val newPartitionsRDD = partitionsRDD.zipPartitions(
other.partitionsRDD, preservesPartitioning = true
) {
(thisIter, otherIter) =>
val thisPart = thisIter.next()
val otherPart = otherIter.next()
Iterator(cleanF(thisPart, otherPart))
}
new VertexSetRDD(newPartitionsRDD)
}
/**
* Restrict the vertex set to the set of vertices satisfying the
* given predicate.
*
* @param pred the user defined predicate, which takes a tuple to conform to
* the RDD[(Vid, VD)] interface
*
* @note The vertex set preserves the original index structure
* which means that the returned RDD can be easily joined with
* the original vertex-set. Furthermore, the filter only
* modifies the bitmap index and so no new values are allocated.
*/
override def filter(pred: Tuple2[Vid, VD] => Boolean): VertexSetRDD[VD] =
this.mapVertexPartitions(_.filter(Function.untupled(pred)))
/**
* Pass each vertex attribute through a map function and retain the
* original RDD's partitioning and index.
*
* @tparam VD2 the type returned by the map function
*
* @param f the function applied to each value in the RDD
* @return a new VertexSetRDD with values obtained by applying `f` to
* each of the entries in the original VertexSet. The resulting
* VertexSetRDD retains the same index.
*/
def mapValues[VD2: ClassManifest](f: VD => VD2): VertexSetRDD[VD2] =
this.mapVertexPartitions(_.map { case (vid, attr) => f(attr) })
/**
* Fill in missing values for all vertices in the index.
*
* @param missingValue the value to be used for vertices in the
* index that don't currently have values.
* @return A VertexSetRDD with a value for all vertices.
*/
def fillMissing(missingValue: VD): VertexSetRDD[VD] = {
this.mapVertexPartitions { part =>
// Allocate a new values array with missing value as the default
val newValues = Array.fill(part.values.size)(missingValue)
// Copy over the old values
part.mask.iterator.foreach { ind =>
newValues(ind) = part.values(ind)
}
// Create a new mask with all vertices in the index
val newMask = part.index.getBitSet
new VertexPartition(part.index, newValues, newMask)
}
}
/**
* Pass each vertex attribute along with the vertex id through a map
* function and retain the original RDD's partitioning and index.
*
* @tparam VD2 the type returned by the map function
*
* @param f the function applied to each vertex id and vertex
* attribute in the RDD
* @return a new VertexSet with values obtained by applying `f` to
* each of the entries in the original VertexSet. The resulting
* VertexSetRDD retains the same index.
*/
def mapValuesWithKeys[VD2: ClassManifest](f: (Vid, VD) => VD2): VertexSetRDD[VD2] = {
this.mapVertexPartitions { part =>
// Construct a view of the map transformation
val newValues = new Array[VD2](part.index.capacity)
part.mask.iterator.foreach { ind =>
newValues(ind) = f(part.index.getValueSafe(ind), part.values(ind))
}
new VertexPartition(part.index, newValues, part.mask)
}
} // end of mapValuesWithKeys
/**
* Inner join this VertexSet with another VertexSet which has the
* same Index. This function will fail if both VertexSets do not
* share the same index. The resulting vertex set will only contain
* vertices that are in both this and the other vertex set.
*
* @tparam VD2 the attribute type of the other VertexSet
* @tparam VD3 the attribute type of the resulting VertexSet
*
* @param other the other VertexSet with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @return a VertexSetRDD containing only the vertices in both this
* and the other VertexSet and with tuple attributes.
*
*/
def zipJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexSetRDD[VD2])(f: (Vid, VD, VD2) => VD3): VertexSetRDD[VD3] = {
this.zipVertexPartitions(other) {
(thisPart, otherPart) =>
if (thisPart.index != otherPart.index) {
throw new SparkException("can't zip join VertexSetRDDs with different indexes")
}
val newValues = new Array[VD3](thisPart.index.capacity)
val newMask = thisPart.mask & otherPart.mask
newMask.iterator.foreach { ind =>
newValues(ind) =
f(thisPart.index.getValueSafe(ind), thisPart.values(ind), otherPart.values(ind))
}
new VertexPartition(thisPart.index, newValues, newMask)
}
}
/**
* Inner join this VertexSet with another VertexSet which has the
* same Index. This function will fail if both VertexSets do not
* share the same index.
*
* @param other the vertex set to join with this vertex set
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a collection of tuples.
* @tparam VD2 the type of the other vertex set attributes
* @tparam VD3 the type of the tuples emitted by `f`
* @return an RDD containing the tuples emitted by `f`
*/
def zipJoinFlatMap[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexSetRDD[VD2])
(f: (Vid, VD, VD2) => Iterator[VD3]): RDD[VD3] = {
val cleanF = sparkContext.clean(f)
partitionsRDD.zipPartitions(other.partitionsRDD) {
(thisPartIter, otherPartIter) =>
val thisPart = thisPartIter.next()
val otherPart = otherPartIter.next()
if (thisPart.index != otherPart.index) {
throw new SparkException("can't zip join VertexSetRDDs with different indexes")
}
(thisPart.mask & otherPart.mask).iterator.flatMap { ind =>
cleanF(thisPart.index.getValueSafe(ind), thisPart.values(ind), otherPart.values(ind))
}
}
}
/**
* Left join this VertexSet with another VertexSet which has the
* same Index. This function will fail if both VertexSets do not
* share the same index. The resulting vertex set contains an entry
* for each vertex in this set. If the other VertexSet is missing
* any vertex in this VertexSet then a `None` attribute is generated
*
* @tparam VD2 the attribute type of the other VertexSet
* @tparam VD3 the attribute type of the resulting VertexSet
*
* @param other the other VertexSet with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @return a VertexSetRDD containing all the vertices in this
* VertexSet with `None` attributes used for Vertices missing in the
* other VertexSet.
*
*/
def leftZipJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexSetRDD[VD2])(f: (Vid, VD, Option[VD2]) => VD3): VertexSetRDD[VD3] = {
this.zipVertexPartitions(other) {
(thisPart, otherPart) =>
if (thisPart.index != otherPart.index) {
throw new SparkException("can't zip join VertexSetRDDs with different indexes")
}
val newValues = new Array[VD3](thisPart.index.capacity)
thisPart.mask.iterator.foreach { ind =>
val otherV = if (otherPart.mask.get(ind)) Option(otherPart.values(ind)) else None
newValues(ind) = f(
thisPart.index.getValueSafe(ind), thisPart.values(ind), otherV)
}
new VertexPartition(thisPart.index, newValues, thisPart.mask)
}
} // end of leftZipJoin
/**
* Left join this VertexSet with an RDD containing vertex attribute
* pairs. If the other RDD is backed by a VertexSet with the same
* index than the efficient leftZipJoin implementation is used. The
* resulting vertex set contains an entry for each vertex in this
* set. If the other VertexSet is missing any vertex in this
* VertexSet then a `None` attribute is generated
*
* @tparam VD2 the attribute type of the other VertexSet
* @tparam VD2 the attribute type of the resulting VertexSet
*
* @param other the other VertexSet with which to join.
* @param f the function mapping a vertex id and its attributes in
* this and the other vertex set to a new vertex attribute.
* @param merge the function used combine duplicate vertex
* attributes
* @return a VertexSetRDD containing all the vertices in this
* VertexSet with the attribute emitted by f.
*
*/
def leftJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: RDD[(Vid, VD2)])
(f: (Vid, VD, Option[VD2]) => VD3, merge: (VD2, VD2) => VD2 = (a: VD2, b: VD2) => a)
: VertexSetRDD[VD3] = {
// Test if the other vertex is a VertexSetRDD to choose the optimal
// join strategy
other match {
// If the other set is a VertexSetRDD then we use the much more efficient
// leftZipJoin
case other: VertexSetRDD[_] => {
leftZipJoin(other)(f)
}
case _ => {
val indexedOther: VertexSetRDD[VD2] = VertexSetRDD(other, this.index, merge)
leftZipJoin(indexedOther)(f)
}
}
} // end of leftJoin
} // end of VertexSetRDD
/**
* The VertexSetRDD singleton is used to construct VertexSets
*/
object VertexSetRDD {
/**
* Construct a vertex set from an RDD of vertex-attribute pairs.
* Duplicate entries are removed arbitrarily.
*
* @tparam VD the vertex attribute type
*
* @param rdd the collection of vertex-attribute pairs
*/
def apply[VD: ClassManifest](rdd: RDD[(Vid, VD)]): VertexSetRDD[VD] =
apply(rdd, (a: VD, b: VD) => a)
/**
* Construct a vertex set from an RDD of vertex-attribute pairs
* where duplicate entries are merged using the reduceFunc
*
* @tparam VD the vertex attribute type
*
* @param rdd the collection of vertex-attribute pairs
* @param reduceFunc the function used to merge attributes of
* duplicate vertices.
*/
def apply[VD: ClassManifest](
rdd: RDD[(Vid, VD)], reduceFunc: (VD, VD) => VD): VertexSetRDD[VD] = {
val cReduceFunc = rdd.context.clean(reduceFunc)
// Preaggregate and shuffle if necessary
val preAgg = rdd.partitioner match {
case Some(p) => rdd
case None =>
val partitioner = new HashPartitioner(rdd.partitions.size)
// Preaggregation.
val aggregator = new Aggregator[Vid, VD, VD](v => v, cReduceFunc, cReduceFunc)
rdd.mapPartitions(aggregator.combineValuesByKey, true).partitionBy(partitioner)
}
val partitionsRDD = preAgg.mapPartitions(iter => {
val hashMap = new PrimitiveKeyOpenHashMap[Vid, VD]
for ((k, v) <- iter) {
hashMap.setMerge(k, v, cReduceFunc)
}
val part = new VertexPartition(hashMap.keySet, hashMap._values, hashMap.keySet.getBitSet)
Iterator(part)
}, preservesPartitioning = true).cache
new VertexSetRDD(partitionsRDD)
} // end of apply
/**
* Construct a vertex set from an RDD using an existing index.
*
* @note duplicate vertices are discarded arbitrarily
*
* @tparam VD the vertex attribute type
* @param rdd the rdd containing vertices
* @param indexPrototype a VertexSetRDD whose indexes will be reused. The
* indexes must be a superset of the vertices in rdd
* in RDD
*/
def apply[VD: ClassManifest](
rdd: RDD[(Vid, VD)], index: VertexSetIndex): VertexSetRDD[VD] =
apply(rdd, index, (a: VD, b: VD) => a)
/**
* Construct a vertex set from an RDD using an existing index and a
* user defined `combiner` to merge duplicate vertices.
*
* @tparam VD the vertex attribute type
* @param rdd the rdd containing vertices
* @param indexPrototype a VertexSetRDD whose indexes will be reused. The
* indexes must be a superset of the vertices in rdd
* @param reduceFunc the user defined reduce function used to merge
* duplicate vertex attributes.
*/
def apply[VD: ClassManifest](
rdd: RDD[(Vid, VD)],
index: VertexSetIndex,
reduceFunc: (VD, VD) => VD): VertexSetRDD[VD] =
apply(rdd, index, (v: VD) => v, reduceFunc, reduceFunc)
/**
* Construct a vertex set from an RDD of Product2[Vid, VD]
*
* @tparam VD the vertex attribute type
* @param rdd the rdd containing vertices
* @param indexPrototype a VertexSetRDD whose indexes will be reused. The
* indexes must be a superset of the vertices in rdd
* @param reduceFunc the user defined reduce function used to merge
* duplicate vertex attributes.
*/
private[spark] def aggregate[VD: ClassManifest, VidVDPair <: Product2[Vid, VD] : ClassManifest](
rdd: RDD[VidVDPair],
index: VertexSetIndex,
reduceFunc: (VD, VD) => VD): VertexSetRDD[VD] = {
val cReduceFunc = rdd.context.clean(reduceFunc)
assert(rdd.partitioner == Some(index.partitioner))
// Use the index to build the new values table
val partitionsRDD = index.rdd.zipPartitions(
rdd, preservesPartitioning = true
) {
(indexIter, tblIter) =>
// There is only one map
val index = indexIter.next()
val mask = new BitSet(index.capacity)
val values = new Array[VD](index.capacity)
for (vertexPair <- tblIter) {
// Get the location of the key in the index
val pos = index.getPos(vertexPair._1)
if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) {
throw new SparkException("Error: Trying to bind an external index " +
"to an RDD which contains keys that are not in the index.")
} else {
// Get the actual index
val ind = pos & OpenHashSet.POSITION_MASK
// If this value has already been seen then merge
if (mask.get(ind)) {
values(ind) = cReduceFunc(values(ind), vertexPair._2)
} else { // otherwise just store the new value
mask.set(ind)
values(ind) = vertexPair._2
}
}
}
Iterator(new VertexPartition(index, values, mask))
}
new VertexSetRDD(partitionsRDD)
}
/**
* Construct a vertex set from an RDD using an existing index and a
* user defined `combiner` to merge duplicate vertices.
*
* @tparam VD the vertex attribute type
* @param rdd the rdd containing vertices
* @param index the index which must be a superset of the vertices
* in RDD
* @param createCombiner a user defined function to create a combiner
* from a vertex attribute
* @param mergeValue a user defined function to merge a vertex
* attribute into an existing combiner
* @param mergeCombiners a user defined function to merge combiners
*
*/
def apply[VD: ClassManifest, C: ClassManifest](
rdd: RDD[(Vid, VD)],
index: VertexSetIndex,
createCombiner: VD => C,
mergeValue: (C, VD) => C,
mergeCombiners: (C, C) => C): VertexSetRDD[C] = {
val cCreateCombiner = rdd.context.clean(createCombiner)
val cMergeValue = rdd.context.clean(mergeValue)
val cMergeCombiners = rdd.context.clean(mergeCombiners)
val partitioner = index.partitioner
// Preaggregate and shuffle if necessary
val partitioned =
if (rdd.partitioner != Some(partitioner)) {
// Preaggregation.
val aggregator = new Aggregator[Vid, VD, C](cCreateCombiner, cMergeValue, cMergeCombiners)
rdd.mapPartitions(aggregator.combineValuesByKey).partitionBy(partitioner)
} else {
rdd.mapValues(x => createCombiner(x))
}
aggregate(partitioned, index, mergeCombiners)
} // end of apply
/**
* Construct an index of the unique vertices. The resulting index
* can be used to build VertexSets over subsets of the vertices in
* the input.
*/
def makeIndex(
keys: RDD[Vid], partitionerOpt: Option[Partitioner] = None): VertexSetIndex = {
val partitioner = partitionerOpt match {
case Some(p) => p
case None => Partitioner.defaultPartitioner(keys)
}
val preAgg: RDD[(Vid, Unit)] = keys.mapPartitions(iter => {
val keys = new VertexIdToIndexMap
while (iter.hasNext) { keys.add(iter.next) }
keys.iterator.map(k => (k, ()))
}, preservesPartitioning = true).partitionBy(partitioner)
val index = preAgg.mapPartitions(iter => {
val index = new VertexIdToIndexMap
while (iter.hasNext) { index.add(iter.next._1) }
Iterator(index)
}, preservesPartitioning = true).cache
new VertexSetIndex(index)
}
/**
* Create a VertexSetRDD with all vertices initialized to the default value.
*
* @param index an index over the set of vertices
* @param defaultValue the default value to use when initializing the vertices
* @tparam VD the type of the vertex attribute
* @return
*/
def apply[VD: ClassManifest](index: VertexSetIndex, defaultValue: VD): VertexSetRDD[VD] = {
// Use the index to build the new values tables
val partitionsRDD = index.rdd.mapPartitions(_.map { index =>
val values = Array.fill(index.capacity)(defaultValue)
val mask = index.getBitSet
new VertexPartition(index, values, mask)
}, preservesPartitioning = true)
new VertexSetRDD(partitionsRDD)
} // end of apply
} // end of object VertexSetRDD

View file

@ -0,0 +1,37 @@
package org.apache.spark.graph.algorithms
import org.apache.spark.graph._
object ConnectedComponents {
/**
* Compute the connected component membership of each vertex and return a graph with the vertex
* value containing the lowest vertex id in the connected component containing that vertex.
*
* @tparam VD the vertex attribute type (discarded in the computation)
* @tparam ED the edge attribute type (preserved in the computation)
*
* @param graph the graph for which to compute the connected components
*
* @return a graph with vertex attributes containing the smallest vertex in each
* connected component
*/
def run[VD: Manifest, ED: Manifest](graph: Graph[VD, ED]): Graph[Vid, ED] = {
val ccGraph = graph.mapVertices { case (vid, _) => vid }
def sendMessage(edge: EdgeTriplet[Vid, ED]) = {
if (edge.srcAttr < edge.dstAttr) {
Iterator((edge.dstId, edge.srcAttr))
} else if (edge.srcAttr > edge.dstAttr) {
Iterator((edge.srcId, edge.dstAttr))
} else {
Iterator.empty
}
}
val initialMessage = Long.MaxValue
Pregel(ccGraph, initialMessage)(
vprog = (id, attr, msg) => math.min(attr, msg),
sendMsg = sendMessage,
mergeMsg = (a, b) => math.min(a, b))
} // end of connectedComponents
}
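An illustrative call (not part of this patch), using a tiny made-up edge list and an assumed SparkContext sc:
{{{
val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(10L, 11L, 1)))
val graph = Graph.fromEdges(edges, 0)   // the default vertex attribute is discarded by run()
val cc = ConnectedComponents.run(graph)
// Expected labels: vertices 1, 2, 3 map to 1; vertices 10, 11 map to 10.
cc.vertices.collect.foreach(println)
}}}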

View file

@ -0,0 +1,205 @@
package org.apache.spark.graph.algorithms
import org.apache.spark.Logging
import org.apache.spark.graph._
object PageRank extends Logging {
/**
* Run PageRank for a fixed number of iterations returning a graph
* with vertex attributes containing the PageRank and edge
* attributes the normalized edge weight.
*
* The following PageRank fixed point is computed for each vertex.
*
* {{{
* var PR = Array.fill(n)( 1.0 )
* val oldPR = Array.fill(n)( 1.0 )
* for( iter <- 0 until numIter ) {
* swap(oldPR, PR)
* for( i <- 0 until n ) {
* PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
* }
* }
* }}}
*
* where `alpha` is the random reset probability (typically 0.15),
* `inNbrs[i]` is the set of neighbors which link to `i` and
* `outDeg[j]` is the out degree of vertex `j`.
*
* Note that this is not the "normalized" PageRank and as a consequence pages that have no
* inlinks will have a PageRank of alpha.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param numIter the number of iterations of PageRank to run
* @param resetProb the random reset probability (alpha)
*
* @return the graph with each vertex attribute containing the PageRank and each edge attribute
* containing the normalized weight.
*
*/
def run[VD: Manifest, ED: Manifest](
graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15): Graph[Double, Double] =
{
/**
* Initialize the pagerankGraph with each edge attribute having
* weight 1/outDegree and each vertex with attribute 1.0.
*/
val pagerankGraph: Graph[Double, Double] = graph
// Associate the degree with each vertex
.outerJoinVertices(graph.outDegrees){
(vid, vdata, deg) => deg.getOrElse(0)
}
// Set the weight on the edges based on the degree
.mapTriplets( e => 1.0 / e.srcAttr )
// Set the vertex attributes to the initial pagerank values
.mapVertices( (id, attr) => 1.0 )
// Display statistics about pagerank
logInfo(pagerankGraph.statistics.toString)
// Define the three functions needed to implement PageRank in the GraphX
// version of Pregel
def vertexProgram(id: Vid, attr: Double, msgSum: Double): Double =
resetProb + (1.0 - resetProb) * msgSum
def sendMessage(edge: EdgeTriplet[Double, Double]) =
Iterator((edge.dstId, edge.srcAttr * edge.attr))
def messageCombiner(a: Double, b: Double): Double = a + b
// The initial message received by all vertices in PageRank
val initialMessage = 0.0
// Execute pregel for a fixed number of iterations.
Pregel(pagerankGraph, initialMessage, numIter)(
vertexProgram, sendMessage, messageCombiner)
}
/**
* Run a dynamic version of PageRank returning a graph with vertex attributes containing the
* PageRank and edge attributes containing the normalized edge weight.
*
* {{{
* var PR = Array.fill(n)( 1.0 )
* val oldPR = Array.fill(n)( 0.0 )
* while( max(abs(PR - oldPR)) > tol ) {
* swap(oldPR, PR)
* for( i <- 0 until n if abs(PR[i] - oldPR[i]) > tol ) {
* PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum
* }
* }
* }}}
*
* where `alpha` is the random reset probability (typically 0.15), `inNbrs[i]` is the set of
* neighbors which link to `i` and `outDeg[j]` is the out degree of vertex `j`.
*
* Note that this is not the "normalized" PageRank and as a consequence pages that have no
* inlinks will have a PageRank of alpha.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param tol the tolerance allowed at convergence (smaller => more accurate).
* @param resetProb the random reset probability (alpha)
*
* @return the graph with each vertex attribute containing the PageRank and each edge attribute
* containing the normalized weight.
*/
def runUntillConvergence[VD: Manifest, ED: Manifest](
graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] =
{
// Initialize the pagerankGraph with each edge attribute
// having weight 1/outDegree and each vertex with attribute 1.0.
val pagerankGraph: Graph[(Double, Double), Double] = graph
// Associate the degree with each vertex
.outerJoinVertices(graph.outDegrees) {
(vid, vdata, deg) => deg.getOrElse(0)
}
// Set the weight on the edges based on the degree
.mapTriplets( e => 1.0 / e.srcAttr )
// Set the vertex attributes to (initialPR, delta = 0)
.mapVertices( (id, attr) => (0.0, 0.0) )
// Display statistics about pagerank
logInfo(pagerankGraph.statistics.toString)
// Define the three functions needed to implement PageRank in the GraphX
// version of Pregel
def vertexProgram(id: Vid, attr: (Double, Double), msgSum: Double): (Double, Double) = {
val (oldPR, lastDelta) = attr
val newPR = oldPR + (1.0 - resetProb) * msgSum
(newPR, newPR - oldPR)
}
def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = {
if (edge.srcAttr._2 > tol) {
Iterator((edge.dstId, edge.srcAttr._2 * edge.attr))
} else {
Iterator.empty
}
}
def messageCombiner(a: Double, b: Double): Double = a + b
// The initial message received by all vertices in PageRank
val initialMessage = resetProb / (1.0 - resetProb)
// Execute a dynamic version of Pregel.
Pregel(pagerankGraph, initialMessage)(vertexProgram, sendMessage, messageCombiner)
.mapVertices((vid, attr) => attr._1)
} // end of deltaPageRank
def runStandalone[VD: Manifest, ED: Manifest](
graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): VertexRDD[Double] = {
// Initialize the ranks
var ranks: VertexRDD[Double] = graph.vertices.mapValues((vid, attr) => resetProb).cache()
// Initialize the delta graph where each vertex stores its delta and each edge knows its weight
var deltaGraph: Graph[Double, Double] =
graph.outerJoinVertices(graph.outDegrees)((vid, vdata, deg) => deg.getOrElse(0))
.mapTriplets(e => 1.0 / e.srcAttr)
.mapVertices((vid, degree) => resetProb).cache()
var numDeltas: Long = ranks.count()
var prevDeltas: Option[VertexRDD[Double]] = None
var i = 0
val weight = (1.0 - resetProb)
while (numDeltas > 0) {
// Compute new deltas. Only deltas that existed in the last round (i.e., were greater than
// `tol`) get to send messages; those that were less than `tol` would send messages less than
// `tol` as well.
val deltas = deltaGraph
.mapReduceTriplets[Double](
et => Iterator((et.dstId, et.srcAttr * et.attr * weight)),
_ + _,
prevDeltas.map((_, EdgeDirection.Out)))
.filter { case (vid, delta) => delta > tol }
.cache()
prevDeltas = Some(deltas)
numDeltas = deltas.count()
logInfo("Standalone PageRank: iter %d has %d deltas".format(i, numDeltas))
// Update deltaGraph with the deltas
deltaGraph = deltaGraph.outerJoinVertices(deltas) { (vid, old, newOpt) =>
newOpt.getOrElse(old)
}.cache()
// Update ranks
ranks = ranks.leftZipJoin(deltas) { (vid, oldRank, deltaOpt) =>
oldRank + deltaOpt.getOrElse(0.0)
}
ranks.foreach(x => {}) // force the iteration for ease of debugging
i += 1
}
ranks
}
}
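A usage sketch for the fixed-iteration variant (illustrative; the input path and the SparkContext sc are assumptions, not part of this patch):
{{{
// Parse a whitespace-separated "srcId dstId" edge list into unit-weight edges.
val edges = sc.textFile("hdfs:///data/edges.txt").map { line =>
  val fields = line.split("\\s+")
  Edge(fields(0).toLong, fields(1).toLong, 1.0)
}
val graph = Graph.fromEdges(edges, 1.0)
// Twenty iterations with the default reset probability of 0.15.
val ranks = PageRank.run(graph, 20).vertices
ranks.collect.foreach { case (id, rank) => println(id + "\t" + rank) }
}}}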

View file

@ -0,0 +1,158 @@
package org.apache.spark.graph.algorithms
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.graph._
import scala.util.Random
import org.apache.commons.math.linear._
class VT ( // vertex type
var v1: RealVector, // v1: p for user node, q for item node
var v2: RealVector, // v2: pu + |N(u)|^(-0.5)*sum(y) for user node, y for item node
var bias: Double,
var norm: Double // only for user node
) extends Serializable
class Msg ( // message
var v1: RealVector,
var v2: RealVector,
var bias: Double
) extends Serializable
object Svdpp {
// implement SVD++ based on http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf
def run(edges: RDD[Edge[Double]]): Graph[VT, Double] = {
// default parameters
val rank = 10
val maxIters = 20
val minVal = 0.0
val maxVal = 5.0
val gamma1 = 0.007
val gamma2 = 0.007
val gamma6 = 0.005
val gamma7 = 0.015
def defaultF(rank: Int) = {
val v1 = new ArrayRealVector(rank)
val v2 = new ArrayRealVector(rank)
for (i <- 0 until rank) {
v1.setEntry(i, Random.nextDouble)
v2.setEntry(i, Random.nextDouble)
}
var vd = new VT(v1, v2, 0.0, 0.0)
vd
}
// calculate initial norm and bias
def mapF0(et: EdgeTriplet[VT, Double]): Iterator[(Vid, (Long, Double))] = {
assert(et.srcAttr != null && et.dstAttr != null)
Iterator((et.srcId, (1L, et.attr)), (et.dstId, (1L, et.attr)))
}
def reduceF0(g1: (Long, Double), g2: (Long, Double)) = {
(g1._1 + g2._1, g1._2 + g2._2)
}
def updateF0(vid: Vid, vd: VT, msg: Option[(Long, Double)]) = {
if (msg.isDefined) {
vd.bias = msg.get._2 / msg.get._1
vd.norm = 1.0 / scala.math.sqrt(msg.get._1)
}
vd
}
// calculate global rating mean
val (rs, rc) = edges.map(e => (e.attr, 1L)).reduce((a, b) => (a._1 + b._1, a._2 + b._2))
val u = rs / rc // global rating mean
// make graph
var g = Graph.fromEdges(edges, defaultF(rank)).cache()
// calculate initial norm and bias
val t0 = g.mapReduceTriplets(mapF0, reduceF0)
g.outerJoinVertices(t0) {updateF0}
// phase 1
def mapF1(et: EdgeTriplet[VT, Double]): Iterator[(Vid, RealVector)] = {
assert(et.srcAttr != null && et.dstAttr != null)
Iterator((et.srcId, et.dstAttr.v2)) // sum up y of connected item nodes
}
def reduceF1(g1: RealVector, g2: RealVector) = {
g1.add(g2)
}
def updateF1(vid: Vid, vd: VT, msg: Option[RealVector]) = {
if (msg.isDefined) {
vd.v2 = vd.v1.add(msg.get.mapMultiply(vd.norm)) // pu + |N(u)|^(-0.5)*sum(y)
}
vd
}
// phase 2
def mapF2(et: EdgeTriplet[VT, Double]): Iterator[(Vid, Msg)] = {
assert(et.srcAttr != null && et.dstAttr != null)
val usr = et.srcAttr
val itm = et.dstAttr
var p = usr.v1
var q = itm.v1
val itmBias = 0.0
val usrBias = 0.0
var pred = u + usr.bias + itm.bias + q.dotProduct(usr.v2)
pred = math.max(pred, minVal)
pred = math.min(pred, maxVal)
val err = et.attr - pred
val y = (q.mapMultiply(err*usr.norm)).subtract((usr.v2).mapMultiply(gamma7))
val newP = (q.mapMultiply(err)).subtract(p.mapMultiply(gamma7)) // for each connected item q
val newQ = (usr.v2.mapMultiply(err)).subtract(q.mapMultiply(gamma7))
Iterator((et.srcId, new Msg(newP, y, err - gamma6*usr.bias)), (et.dstId, new Msg(newQ, y, err - gamma6*itm.bias)))
}
def reduceF2(g1: Msg, g2: Msg):Msg = {
g1.v1 = g1.v1.add(g2.v1)
g1.v2 = g1.v2.add(g2.v2)
g1.bias += g2.bias
g1
}
def updateF2(vid: Vid, vd: VT, msg: Option[Msg]) = {
if (msg.isDefined) {
vd.v1 = vd.v1.add(msg.get.v1.mapMultiply(gamma2))
if (vid % 2 == 1) { // item node update y
vd.v2 = vd.v2.add(msg.get.v2.mapMultiply(gamma2))
}
vd.bias += msg.get.bias*gamma1
}
vd
}
for (i <- 0 until maxIters) {
// phase 1
val t1: VertexRDD[RealVector] = g.mapReduceTriplets(mapF1, reduceF1)
g.outerJoinVertices(t1) {updateF1}
// phase 2
val t2: VertexRDD[Msg] = g.mapReduceTriplets(mapF2, reduceF2)
g.outerJoinVertices(t2) {updateF2}
}
// calculate error on training set
def mapF3(et: EdgeTriplet[VT, Double]): Iterator[(Vid, Double)] = {
assert(et.srcAttr != null && et.dstAttr != null)
val usr = et.srcAttr
val itm = et.dstAttr
var p = usr.v1
var q = itm.v1
val itmBias = 0.0
val usrBias = 0.0
var pred = u + usr.bias + itm.bias + q.dotProduct(usr.v2)
pred = math.max(pred, minVal)
pred = math.min(pred, maxVal)
val err = (et.attr - pred)*(et.attr - pred)
Iterator((et.dstId, err))
}
def updateF3(vid: Vid, vd: VT, msg: Option[Double]) = {
if (msg.isDefined && vid % 2 == 1) { // item sum up the errors
vd.norm = msg.get
}
vd
}
val t3: VertexRDD[Double] = g.mapReduceTriplets(mapF3, _ + _)
g.outerJoinVertices(t3) {updateF3}
g
}
}
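A usage sketch (illustrative only): ratings are supplied as Edge[Double] values from a user vertex to an item vertex. Following the vid % 2 convention used in updateF2 and updateF3 above, item vertices are given odd ids and user vertices even ids; sc is an assumed SparkContext.
{{{
val ratings = sc.parallelize(Seq(
  Edge(2L, 1L, 4.0),   // user 2 rated item 1 with 4.0
  Edge(2L, 3L, 3.0),   // user 2 rated item 3 with 3.0
  Edge(4L, 3L, 5.0)))  // user 4 rated item 3 with 5.0
val model: Graph[VT, Double] = Svdpp.run(ratings)
// User vertices now hold the learned factors; item vertices also carry the accumulated
// squared training error in their `norm` field (see updateF3 above).
}}}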

View file

@ -0,0 +1,76 @@
package org.apache.spark.graph.algorithms
import org.apache.spark.graph._
object TriangleCount {
/**
* Compute the number of triangles passing through each vertex.
*
* The algorithm is relatively straightforward and can be computed in three steps:
*
* 1) Compute the set of neighbors for each vertex
* 2) For each edge compute the intersection of the sets and send the
* count to both vertices.
* 3) Compute the sum at each vertex and divide by two since each
* triangle is counted twice.
*
*
* @param graph a graph with `sourceId` less than `destId`. The graph must have been partitioned
* using Graph.partitionBy.
*
* @return a graph with vertex attributes containing the number of triangles passing through that vertex
*/
def run[VD: ClassManifest, ED: ClassManifest](graph: Graph[VD,ED]): Graph[Int, ED] = {
// Remove redundant edges
val g = graph.groupEdges((a, b) => a).cache
// Construct set representations of the neighborhoods
val nbrSets: VertexRDD[VertexSet] =
g.collectNeighborIds(EdgeDirection.Both).mapValues { (vid, nbrs) =>
val set = new VertexSet(4)
var i = 0
while (i < nbrs.size) {
// prevent self cycle
if(nbrs(i) != vid) {
set.add(nbrs(i))
}
i += 1
}
set
}
// join the sets with the graph
val setGraph: Graph[VertexSet, ED] = g.outerJoinVertices(nbrSets) {
(vid, _, optSet) => optSet.getOrElse(null)
}
// Edge function computes intersection of smaller vertex with larger vertex
def edgeFunc(et: EdgeTriplet[VertexSet, ED]): Iterator[(Vid, Int)] = {
assert(et.srcAttr != null)
assert(et.dstAttr != null)
val (smallSet, largeSet) = if (et.srcAttr.size < et.dstAttr.size) {
(et.srcAttr, et.dstAttr)
} else {
(et.dstAttr, et.srcAttr)
}
val iter = smallSet.iterator
var counter: Int = 0
while (iter.hasNext) {
val vid = iter.next
if (vid != et.srcId && vid != et.dstId && largeSet.contains(vid)) { counter += 1 }
}
Iterator((et.srcId, counter), (et.dstId, counter))
}
// compute the intersection along edges
val counters: VertexRDD[Int] = setGraph.mapReduceTriplets(edgeFunc, _ + _)
// Merge counters with the graph and divide by two since each triangle is counted twice
g.outerJoinVertices(counters) {
(vid, _, optCounter: Option[Int]) =>
val dblCount = optCounter.getOrElse(0)
// double count should be even (divisible by two)
assert((dblCount & 1) == 0)
dblCount / 2
}
} // end of TriangleCount
}
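An illustrative call honouring the contract above: edges are made canonical (srcId less than dstId) and the graph is partitioned with Graph.partitionBy before counting. The ids, the choice of RandomVertexCut, and the SparkContext sc are assumptions for the sketch.
{{{
val rawEdges = sc.parallelize(Seq((1L, 2L), (2L, 3L), (3L, 1L), (3L, 4L)))
val canonical = rawEdges.map { case (a, b) => if (a < b) Edge(a, b, 1) else Edge(b, a, 1) }
val graph = Graph.fromEdges(canonical, 0).partitionBy(RandomVertexCut).cache()
// Vertices 1, 2 and 3 form one triangle; vertex 4 participates in none.
val triangles = TriangleCount.run(graph).vertices
}}}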

View file

@@ -1,29 +1,36 @@
 package org.apache.spark.graph.impl

 import org.apache.spark.graph._
-import org.apache.spark.util.collection.OpenHashMap
+import org.apache.spark.util.collection.PrimitiveKeyOpenHashMap

 /**
- * A collection of edges stored in 3 large columnar arrays (src, dst, attribute).
+ * A collection of edges stored in 3 large columnar arrays (src, dst, attribute). The arrays are
+ * clustered by src.
  *
  * @param srcIds the source vertex id of each edge
  * @param dstIds the destination vertex id of each edge
  * @param data the attribute associated with each edge
+ * @param index a clustered index on source vertex id
  * @tparam ED the edge attribute type.
  */
 class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest](
     val srcIds: Array[Vid],
     val dstIds: Array[Vid],
-    val data: Array[ED]) {
+    val data: Array[ED],
+    val index: PrimitiveKeyOpenHashMap[Vid, Int]) {

   /**
    * Reverse all the edges in this partition.
    *
+   * @note No new data structures are created.
+   *
    * @return a new edge partition with all edges reversed.
    */
-  def reverse: EdgePartition[ED] = new EdgePartition(dstIds, srcIds, data)
+  def reverse: EdgePartition[ED] = {
+    val builder = new EdgePartitionBuilder(size)
+    for (e <- iterator) {
+      builder.add(e.dstId, e.srcId, e.attr)
+    }
+    builder.toEdgePartition
+  }

   /**
    * Construct a new edge partition by applying the function f to all
@@ -46,7 +53,7 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
       newData(i) = f(edge)
       i += 1
     }
-    new EdgePartition(srcIds, dstIds, newData)
+    new EdgePartition(srcIds, dstIds, newData, index)
   }

   /**
@@ -55,16 +62,7 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    * @param f an external state mutating user defined function.
    */
   def foreach(f: Edge[ED] => Unit) {
-    val edge = new Edge[ED]
-    val size = data.size
-    var i = 0
-    while (i < size) {
-      edge.srcId = srcIds(i)
-      edge.dstId = dstIds(i)
-      edge.attr = data(i)
-      f(edge)
-      i += 1
-    }
+    iterator.foreach(f)
   }

   /**
@@ -75,21 +73,29 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    * @return a new edge partition without duplicate edges
    */
   def groupEdges(merge: (ED, ED) => ED): EdgePartition[ED] = {
-    // Aggregate all matching edges in a hashmap
-    val agg = new OpenHashMap[(Vid,Vid), ED]
-    foreach { e => agg.setMerge((e.srcId, e.dstId), e.attr, merge) }
-    // Populate new srcId, dstId, and data, arrays
-    val newSrcIds = new Array[Vid](agg.size)
-    val newDstIds = new Array[Vid](agg.size)
-    val newData = new Array[ED](agg.size)
+    val builder = new EdgePartitionBuilder[ED]
+    var firstIter: Boolean = true
+    var currSrcId: Vid = nullValue[Vid]
+    var currDstId: Vid = nullValue[Vid]
+    var currAttr: ED = nullValue[ED]
     var i = 0
-    agg.foreach { kv =>
-      newSrcIds(i) = kv._1._1
-      newDstIds(i) = kv._1._2
-      newData(i) = kv._2
+    while (i < size) {
+      if (i > 0 && currSrcId == srcIds(i) && currDstId == dstIds(i)) {
+        currAttr = merge(currAttr, data(i))
+      } else {
+        if (i > 0) {
+          builder.add(currSrcId, currDstId, currAttr)
+        }
+        currSrcId = srcIds(i)
+        currDstId = dstIds(i)
+        currAttr = data(i)
+      }
       i += 1
     }
-    new EdgePartition(newSrcIds, newDstIds, newData)
+    if (size > 0) {
+      builder.add(currSrcId, currDstId, currAttr)
+    }
+    builder.toEdgePartition
   }

   /**
@@ -99,6 +105,9 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    */
   def size: Int = srcIds.size

+  /** The number of unique source vertices in the partition. */
+  def indexSize: Int = index.size
+
   /**
    * Get an iterator over the edges in this partition.
    *
@@ -118,4 +127,34 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
       edge
     }
   }
+
+  /**
+   * Get an iterator over the edges in this partition whose source vertex ids match srcIdPred. The
+   * iterator is generated using an index scan, so it is efficient at skipping edges that don't
+   * match srcIdPred.
+   */
+  def indexIterator(srcIdPred: Vid => Boolean): Iterator[Edge[ED]] =
+    index.iterator.filter(kv => srcIdPred(kv._1)).flatMap(Function.tupled(clusterIterator))
+
+  /**
+   * Get an iterator over the cluster of edges in this partition with source vertex id `srcId`. The
+   * cluster must start at position `index`.
+   */
+  private def clusterIterator(srcId: Vid, index: Int) = new Iterator[Edge[ED]] {
+    private[this] val edge = new Edge[ED]
+    private[this] var pos = index
+
+    override def hasNext: Boolean = {
+      pos >= 0 && pos < EdgePartition.this.size && srcIds(pos) == srcId
+    }
+
+    override def next(): Edge[ED] = {
+      assert(srcIds(pos) == srcId)
+      edge.srcId = srcIds(pos)
+      edge.dstId = dstIds(pos)
+      edge.attr = data(pos)
+      pos += 1
+      edge
+    }
+  }
 }
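
The `index` field and `indexIterator` above depend on the edge arrays being clustered by source id. A rough stand-alone sketch of that idea with simplified types (not the real EdgePartition API): keep the edges sorted by source in parallel arrays, remember the offset where each source's run starts, and answer per-source queries by scanning only that run.

class TinyEdgeStore(val srcIds: Array[Long], val dstIds: Array[Long]) {
  require(srcIds.length == dstIds.length)

  // Offset of the first edge in each source id's run (assumes srcIds is sorted by source).
  private val index: Map[Long, Int] =
    srcIds.zipWithIndex.foldLeft(Map.empty[Long, Int]) { case (m, (src, i)) =>
      if (m.contains(src)) m else m + (src -> i)
    }

  def size: Int = srcIds.length

  // Visit only edges whose source id satisfies the predicate, by jumping to each matching run.
  def indexIterator(srcPred: Long => Boolean): Iterator[(Long, Long)] =
    index.iterator.filter { case (src, _) => srcPred(src) }.flatMap { case (src, start) =>
      Iterator.from(start)
        .takeWhile(pos => pos < size && srcIds(pos) == src)
        .map(pos => (srcIds(pos), dstIds(pos)))
    }
}

// Edges (1,10), (1,11), (2,12), (3,13), already clustered by source id.
val store = new TinyEdgeStore(Array(1L, 1L, 2L, 3L), Array(10L, 11L, 12L, 13L))
store.indexIterator(_ == 1L).foreach(println)   // prints (1,10) and (1,11) only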


@@ -1,27 +1,45 @@
 package org.apache.spark.graph.impl

-import scala.collection.mutable.ArrayBuilder
+import scala.util.Sorting

 import org.apache.spark.graph._
+import org.apache.spark.util.collection.{PrimitiveKeyOpenHashMap, PrimitiveVector}

 //private[graph]
-class EdgePartitionBuilder[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
-ED: ClassManifest]{
-  val srcIds = new VertexArrayList
-  val dstIds = new VertexArrayList
-  var dataBuilder = ArrayBuilder.make[ED]
+class EdgePartitionBuilder[@specialized(Long, Int, Double) ED: ClassManifest](size: Int = 64) {
+  var edges = new PrimitiveVector[Edge[ED]](size)

   /** Add a new edge to the partition. */
   def add(src: Vid, dst: Vid, d: ED) {
-    srcIds.add(src)
-    dstIds.add(dst)
-    dataBuilder += d
+    edges += Edge(src, dst, d)
   }

   def toEdgePartition: EdgePartition[ED] = {
-    new EdgePartition(srcIds.toLongArray(), dstIds.toLongArray(), dataBuilder.result())
+    val edgeArray = edges.trim().array
+    Sorting.quickSort(edgeArray)(Edge.lexicographicOrdering)
+    val srcIds = new Array[Vid](edgeArray.size)
+    val dstIds = new Array[Vid](edgeArray.size)
+    val data = new Array[ED](edgeArray.size)
+    val index = new PrimitiveKeyOpenHashMap[Vid, Int]
+    // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and
+    // adding them to the index
+    if (edgeArray.length > 0) {
+      index.update(srcIds(0), 0)
+      var currSrcId: Vid = srcIds(0)
+      var i = 0
+      while (i < edgeArray.size) {
+        srcIds(i) = edgeArray(i).srcId
+        dstIds(i) = edgeArray(i).dstId
+        data(i) = edgeArray(i).attr
+        if (edgeArray(i).srcId != currSrcId) {
+          currSrcId = edgeArray(i).srcId
+          index.update(currSrcId, i)
+        }
+        i += 1
+      }
+    }
+    new EdgePartition(srcIds, dstIds, data, index)
   }
 }
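
`toEdgePartition` above sorts the buffered edges lexicographically and records the offset at which each new source id begins. A compact stand-alone version of that loop, using a hypothetical `TinyEdge` case class in place of the real `Edge` (here `sortBy` stands in for `Sorting.quickSort` with the lexicographic ordering):

case class TinyEdge(srcId: Long, dstId: Long, attr: Double)

// Sort lexicographically by (srcId, dstId), then record where each source id's cluster begins.
def buildIndex(edges: Array[TinyEdge]): (Array[TinyEdge], Map[Long, Int]) = {
  val sorted = edges.sortBy(e => (e.srcId, e.dstId))
  var index = Map.empty[Long, Int]
  var i = 0
  while (i < sorted.length) {
    if (i == 0 || sorted(i).srcId != sorted(i - 1).srcId) {
      index += (sorted(i).srcId -> i)   // first edge of a new source-id cluster
    }
    i += 1
  }
  (sorted, index)
}

val (sorted, index) = buildIndex(Array(TinyEdge(2, 5, 1.0), TinyEdge(1, 3, 1.0), TinyEdge(1, 2, 1.0)))
println(index)   // Map(1 -> 0, 2 -> 2)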


@@ -0,0 +1,41 @@
package org.apache.spark.graph.impl

import org.apache.spark.graph._
import org.apache.spark.util.collection.PrimitiveKeyOpenHashMap

/**
 * The Iterator type returned when constructing edge triplets. This class technically could be
 * an anonymous class in GraphImpl.triplets, but we name it here explicitly so it is easier to
 * debug / profile.
 */
private[impl]
class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
    val vidToIndex: VertexIdToIndexMap,
    val vertexArray: Array[VD],
    val edgePartition: EdgePartition[ED])
  extends Iterator[EdgeTriplet[VD, ED]] {

  // Current position in the array.
  private var pos = 0

  // A triplet object that this iterator.next() call returns. We reuse this object to avoid
  // allocating too many temporary Java objects.
  private val triplet = new EdgeTriplet[VD, ED]

  private val vmap = new PrimitiveKeyOpenHashMap[Vid, VD](vidToIndex, vertexArray)

  override def hasNext: Boolean = pos < edgePartition.size

  override def next() = {
    triplet.srcId = edgePartition.srcIds(pos)
    // assert(vmap.containsKey(e.src.id))
    triplet.srcAttr = vmap(triplet.srcId)
    triplet.dstId = edgePartition.dstIds(pos)
    // assert(vmap.containsKey(e.dst.id))
    triplet.dstAttr = vmap(triplet.dstId)
    triplet.attr = edgePartition.data(pos)
    pos += 1
    triplet
  }
}
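
EdgeTripletIterator reuses a single mutable triplet across `next()` calls to avoid allocating one object per edge. A tiny stand-alone illustration of that pattern (the `Cell` type is made up); the usual caveat applies: consume each element before advancing, and never collect the returned references.

final class Cell(var value: Int)

// Returns the same Cell instance from every next() call, mutated in place.
def reusingIterator(data: Array[Int]): Iterator[Cell] = new Iterator[Cell] {
  private[this] val cell = new Cell(0)
  private[this] var pos = 0
  override def hasNext: Boolean = pos < data.length
  override def next(): Cell = {
    cell.value = data(pos)
    pos += 1
    cell
  }
}

// Safe: each element is consumed before the iterator advances.
reusingIterator(Array(1, 2, 3)).foreach(c => println(c.value))
// Unsafe: .toList would yield three references to the one mutated Cell.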


@ -1,66 +1,15 @@
package org.apache.spark.graph.impl package org.apache.spark.graph.impl
import scala.collection.JavaConversions._ import org.apache.spark.util.collection.PrimitiveVector
import org.apache.spark.{HashPartitioner, Partitioner}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.SparkContext._ import org.apache.spark.SparkContext._
import org.apache.spark.HashPartitioner
import org.apache.spark.util.ClosureCleaner
import org.apache.spark.SparkException
import org.apache.spark.Partitioner
import org.apache.spark.graph._ import org.apache.spark.graph._
import org.apache.spark.graph.impl.GraphImpl._ import org.apache.spark.graph.impl.GraphImpl._
import org.apache.spark.graph.impl.MsgRDDFunctions._ import org.apache.spark.graph.impl.MsgRDDFunctions._
import org.apache.spark.graph.util.BytecodeUtils import org.apache.spark.graph.util.BytecodeUtils
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.{ShuffledRDD, RDD}
import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.collection.{BitSet, OpenHashSet, PrimitiveKeyOpenHashMap} import org.apache.spark.util.ClosureCleaner
/**
* The Iterator type returned when constructing edge triplets
*/
class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
val vidToIndex: VertexIdToIndexMap,
val vertexArray: Array[VD],
val edgePartition: EdgePartition[ED]) extends Iterator[EdgeTriplet[VD, ED]] {
private var pos = 0
private val et = new EdgeTriplet[VD, ED]
private val vmap = new PrimitiveKeyOpenHashMap[Vid, VD](vidToIndex, vertexArray)
override def hasNext: Boolean = pos < edgePartition.size
override def next() = {
et.srcId = edgePartition.srcIds(pos)
// assert(vmap.containsKey(e.src.id))
et.srcAttr = vmap(et.srcId)
et.dstId = edgePartition.dstIds(pos)
// assert(vmap.containsKey(e.dst.id))
et.dstAttr = vmap(et.dstId)
et.attr = edgePartition.data(pos)
pos += 1
et
}
override def toList: List[EdgeTriplet[VD, ED]] = {
val lb = new mutable.ListBuffer[EdgeTriplet[VD,ED]]
val currentEdge = new EdgeTriplet[VD, ED]
for (i <- (0 until edgePartition.size)) {
currentEdge.srcId = edgePartition.srcIds(i)
// assert(vmap.containsKey(e.src.id))
currentEdge.srcAttr = vmap(currentEdge.srcId)
currentEdge.dstId = edgePartition.dstIds(i)
// assert(vmap.containsKey(e.dst.id))
currentEdge.dstAttr = vmap(currentEdge.dstId)
currentEdge.attr = edgePartition.data(i)
lb += currentEdge
}
lb.toList
}
} // end of Edge Triplet Iterator
/** /**
@ -74,59 +23,95 @@ class EdgeTripletIterator[VD: ClassManifest, ED: ClassManifest](
* destinations. `vertexPlacement` specifies where each vertex will be * destinations. `vertexPlacement` specifies where each vertex will be
* replicated. `vTableReplicated` stores the replicated vertex attributes, which * replicated. `vTableReplicated` stores the replicated vertex attributes, which
* are co-partitioned with the relevant edges. * are co-partitioned with the relevant edges.
*
* mask in vertices means filter
* mask in vTableReplicated means active
*/ */
class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected ( class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
@transient val vTable: VertexSetRDD[VD], @transient val vertices: VertexRDD[VD],
@transient val eTable: RDD[(Pid, EdgePartition[ED])], @transient val edges: EdgeRDD[ED],
@transient val vertexPlacement: VertexPlacement, @transient val vertexPlacement: VertexPlacement,
@transient val partitioner: PartitionStrategy) @transient val vTableReplicated: VTableReplicated[VD])
extends Graph[VD, ED] { extends Graph[VD, ED] {
def this() = this(null, null, null, null) def this(
vertices: VertexRDD[VD],
edges: EdgeRDD[ED],
vertexPlacement: VertexPlacement) = {
this(vertices, edges, vertexPlacement, new VTableReplicated(vertices, edges, vertexPlacement))
}
@transient val vTableReplicated: VTableReplicated[VD] = def this(
new VTableReplicated(vTable, eTable, vertexPlacement) vertices: VertexRDD[VD],
edges: EdgeRDD[ED]) = {
this(vertices, edges, new VertexPlacement(edges, vertices))
}
/** Return a RDD of vertices. */ /** Return a RDD that brings edges together with their source and destination vertices. */
@transient override val vertices = vTable @transient override val triplets: RDD[EdgeTriplet[VD, ED]] = {
val vdManifest = classManifest[VD]
val edManifest = classManifest[ED]
/** Return a RDD of edges. */ edges.zipEdgePartitions(vTableReplicated.get(true, true)) { (ePart, vPartIter) =>
@transient override val edges: RDD[Edge[ED]] = val (_, vPart) = vPartIter.next()
eTable.mapPartitions(_.next()._2.iterator, true) new EdgeTripletIterator(vPart.index, vPart.values, ePart)(vdManifest, edManifest)
}
/** Return a RDD that brings edges with its source and destination vertices together. */ }
@transient override val triplets: RDD[EdgeTriplet[VD, ED]] =
makeTriplets(vTableReplicated.bothAttrs, eTable)
override def persist(newLevel: StorageLevel): Graph[VD, ED] = { override def persist(newLevel: StorageLevel): Graph[VD, ED] = {
vTable.persist(newLevel) vertices.persist(newLevel)
eTable.persist(newLevel) edges.persist(newLevel)
vertexPlacement.persist(newLevel)
this this
} }
override def cache(): Graph[VD, ED] = persist(StorageLevel.MEMORY_ONLY) override def cache(): Graph[VD, ED] = persist(StorageLevel.MEMORY_ONLY)
override def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] = {
val numPartitions = edges.partitions.size
val edManifest = classManifest[ED]
val newEdges = new EdgeRDD(edges.map { e =>
val part: Pid = partitionStrategy.getPartition(e.srcId, e.dstId, numPartitions)
// Should we be using 3-tuple or an optimized class
new MessageToPartition(part, (e.srcId, e.dstId, e.attr))
}
.partitionBy(new HashPartitioner(numPartitions))
.mapPartitionsWithIndex( { (pid, iter) =>
val builder = new EdgePartitionBuilder[ED]()(edManifest)
iter.foreach { message =>
val data = message.data
builder.add(data._1, data._2, data._3)
}
val edgePartition = builder.toEdgePartition
Iterator((pid, edgePartition))
}, preservesPartitioning = true).cache())
new GraphImpl(vertices, newEdges)
}
override def statistics: Map[String, Any] = { override def statistics: Map[String, Any] = {
val numVertices = this.numVertices // Get the total number of vertices after replication, used to compute the replication ratio.
val numEdges = this.numEdges def numReplicatedVertices(vid2pids: RDD[Array[Array[Vid]]]): Double = {
val replicationRatioBothAttrs = vid2pids.map(_.map(_.size).sum.toLong).reduce(_ + _).toDouble
vertexPlacement.bothAttrs.map(_.map(_.size).sum).sum / numVertices }
val replicationRatioSrcAttrOnly =
vertexPlacement.srcAttrOnly.map(_.map(_.size).sum).sum / numVertices val numVertices = this.ops.numVertices
val replicationRatioDstAttrOnly = val numEdges = this.ops.numEdges
vertexPlacement.dstAttrOnly.map(_.map(_.size).sum).sum / numVertices val replicationRatioBoth = numReplicatedVertices(vertexPlacement.bothAttrs) / numVertices
val loadArray = val replicationRatioSrcOnly = numReplicatedVertices(vertexPlacement.srcAttrOnly) / numVertices
eTable.map{ case (pid, epart) => epart.data.size }.collect.map(x => x.toDouble / numEdges) val replicationRatioDstOnly = numReplicatedVertices(vertexPlacement.dstAttrOnly) / numVertices
// One entry for each partition, indicate the total number of edges on that partition.
val loadArray = edges.partitionsRDD.map(_._2.size).collect().map(_.toDouble / numEdges)
val minLoad = loadArray.min val minLoad = loadArray.min
val maxLoad = loadArray.max val maxLoad = loadArray.max
Map( Map(
"Num Vertices" -> numVertices, "Num Edges" -> numEdges, "Num Vertices" -> numVertices,
"Replication (both)" -> replicationRatioBothAttrs, "Num Edges" -> numEdges,
"Replication (src only)" -> replicationRatioSrcAttrOnly, "Replication (both)" -> replicationRatioBoth,
"Replication (dest only)" -> replicationRatioDstAttrOnly, "Replication (src only)" -> replicationRatioSrcOnly,
"Replication (dest only)" -> replicationRatioDstOnly,
"Load Array" -> loadArray, "Load Array" -> loadArray,
"Min Load" -> minLoad, "Max Load" -> maxLoad) "Min Load" -> minLoad,
"Max Load" -> maxLoad)
} }
/** /**
@ -137,7 +122,7 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
rdd: RDD[_], rdd: RDD[_],
indent: String = "", indent: String = "",
visited: Map[Int, String] = Map.empty[Int, String]) { visited: Map[Int, String] = Map.empty[Int, String]) {
if(visited.contains(rdd.id)) { if (visited.contains(rdd.id)) {
println(indent + visited(rdd.id)) println(indent + visited(rdd.id))
println(indent) println(indent)
} else { } else {
@ -155,57 +140,79 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
} }
} }
println("eTable ------------------------------------------") println("eTable ------------------------------------------")
traverseLineage(eTable, " ") traverseLineage(edges, " ")
var visited = Map(eTable.id -> "eTable") var visited = Map(edges.id -> "eTable")
println("\n\nvTable ------------------------------------------") println("\n\nvTable ------------------------------------------")
traverseLineage(vTable, " ", visited) traverseLineage(vertices, " ", visited)
visited += (vTable.id -> "vTable") visited += (vertices.id -> "vTable")
println("\n\nvertexPlacement.bothAttrs -------------------------------") println("\n\nvertexPlacement.bothAttrs -------------------------------")
traverseLineage(vertexPlacement.bothAttrs, " ", visited) traverseLineage(vertexPlacement.bothAttrs, " ", visited)
visited += (vertexPlacement.bothAttrs.id -> "vertexPlacement.bothAttrs") visited += (vertexPlacement.bothAttrs.id -> "vertexPlacement.bothAttrs")
println("\n\nvTableReplicated.bothAttrs ----------------")
traverseLineage(vTableReplicated.bothAttrs, " ", visited)
visited += (vTableReplicated.bothAttrs.id -> "vTableReplicated.bothAttrs")
println("\n\ntriplets ----------------------------------------") println("\n\ntriplets ----------------------------------------")
traverseLineage(triplets, " ", visited) traverseLineage(triplets, " ", visited)
println(visited) println(visited)
} // end of print lineage } // end of printLineage
override def reverse: Graph[VD, ED] = { override def reverse: Graph[VD, ED] =
val newETable = eTable.mapPartitions(_.map { case (pid, epart) => (pid, epart.reverse) }, new GraphImpl(vertices, edges.mapEdgePartitions(_.reverse), vertexPlacement, vTableReplicated)
preservesPartitioning = true)
new GraphImpl(vTable, newETable, vertexPlacement, partitioner) override def mapVertices[VD2: ClassManifest](f: (Vid, VD) => VD2): Graph[VD2, ED] = {
if (classManifest[VD] equals classManifest[VD2]) {
// The map preserves type, so we can use incremental replication
val newVerts = vertices.mapVertexPartitions(_.map(f))
val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts)
val newVTableReplicated = new VTableReplicated[VD2](
changedVerts, edges, vertexPlacement,
Some(vTableReplicated.asInstanceOf[VTableReplicated[VD2]]))
new GraphImpl(newVerts, edges, vertexPlacement, newVTableReplicated)
} else {
// The map does not preserve type, so we must re-replicate all vertices
new GraphImpl(vertices.mapVertexPartitions(_.map(f)), edges, vertexPlacement)
}
} }
override def mapVertices[VD2: ClassManifest](f: (Vid, VD) => VD2): Graph[VD2, ED] = override def mapEdges[ED2: ClassManifest](f: Edge[ED] => ED2): Graph[VD, ED2] =
new GraphImpl(vTable.mapVertexPartitions(_.map(f)), eTable, vertexPlacement, partitioner) new GraphImpl(vertices, edges.mapEdgePartitions(_.map(f)), vertexPlacement, vTableReplicated)
override def mapEdges[ED2: ClassManifest](f: Edge[ED] => ED2): Graph[VD, ED2] = { override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] = {
val newETable = eTable.mapPartitions(_.map { case (pid, epart) => (pid, epart.map(f)) }, // Use an explicit manifest in PrimitiveKeyOpenHashMap init so we don't pull in the implicit
preservesPartitioning = true) // manifest from GraphImpl (which would require serializing GraphImpl).
new GraphImpl(vTable, newETable, vertexPlacement, partitioner) val vdManifest = classManifest[VD]
val newETable =
edges.zipEdgePartitions(vTableReplicated.get(true, true)) { (edgePartition, vTableReplicatedIter) =>
val (pid, vPart) = vTableReplicatedIter.next()
val et = new EdgeTriplet[VD, ED]
val newEdgePartition = edgePartition.map { e =>
et.set(e)
et.srcAttr = vPart(e.srcId)
et.dstAttr = vPart(e.dstId)
f(et)
}
Iterator((pid, newEdgePartition))
}
new GraphImpl(vertices, new EdgeRDD(newETable), vertexPlacement, vTableReplicated)
} }
override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] =
GraphImpl.mapTriplets(this, f)
override def subgraph( override def subgraph(
epred: EdgeTriplet[VD, ED] => Boolean = (x => true), epred: EdgeTriplet[VD, ED] => Boolean = x => true,
vpred: (Vid, VD) => Boolean = ((a,b) => true)): Graph[VD, ED] = { vpred: (Vid, VD) => Boolean = (a, b) => true): Graph[VD, ED] = {
// Filter the vertices, reusing the partitioner (but not the index) from // Filter the vertices, reusing the partitioner (but not the index) from
// this graph // this graph
val newVTable = vTable.mapVertexPartitions(_.filter(vpred).reindex()) val newVTable = vertices.mapVertexPartitions(_.filter(vpred).reindex())
// Restrict the set of edges to those that satisfy the vertex and the edge predicate. val edManifest = classManifest[ED]
val newETable = createETable(
triplets.filter(t => vpred(t.srcId, t.srcAttr) && vpred(t.dstId, t.dstAttr) && epred(t))
.map(t => Edge(t.srcId, t.dstId, t.attr)), partitioner)
// Construct the VertexPlacement map val newETable = new EdgeRDD[ED](triplets.filter { et =>
val newVertexPlacement = new VertexPlacement(newETable, newVTable) vpred(et.srcId, et.srcAttr) && vpred(et.dstId, et.dstAttr) && epred(et)
}.mapPartitionsWithIndex( { (pid, iter) =>
val builder = new EdgePartitionBuilder[ED]()(edManifest)
iter.foreach { et => builder.add(et.srcId, et.dstId, et.attr) }
val edgePartition = builder.toEdgePartition
Iterator((pid, edgePartition))
}, preservesPartitioning = true)).cache()
new GraphImpl(newVTable, newETable, newVertexPlacement, partitioner) new GraphImpl(newVTable, newETable)
} // end of subgraph } // end of subgraph
override def mask[VD2: ClassManifest, ED2: ClassManifest] ( override def mask[VD2: ClassManifest, ED2: ClassManifest] (
@ -213,10 +220,8 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
override def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] = { override def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] = {
ClosureCleaner.clean(merge) ClosureCleaner.clean(merge)
val newETable = val newETable = edges.mapEdgePartitions(_.groupEdges(merge))
eTable.mapPartitions({ _.map(p => (p._1, p._2.groupEdges(merge))) }, new GraphImpl(vertices, newETable, vertexPlacement, vTableReplicated)
preservesPartitioning = true)
new GraphImpl(vTable, newETable, vertexPlacement, partitioner)
} }
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
@ -225,14 +230,91 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
override def mapReduceTriplets[A: ClassManifest]( override def mapReduceTriplets[A: ClassManifest](
mapFunc: EdgeTriplet[VD, ED] => Iterator[(Vid, A)], mapFunc: EdgeTriplet[VD, ED] => Iterator[(Vid, A)],
reduceFunc: (A, A) => A): VertexSetRDD[A] = reduceFunc: (A, A) => A,
GraphImpl.mapReduceTriplets(this, mapFunc, reduceFunc) activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) = {
ClosureCleaner.clean(mapFunc)
ClosureCleaner.clean(reduceFunc)
// For each vertex, replicate its attribute only to partitions where it is
// in the relevant position in an edge.
val mapUsesSrcAttr = accessesVertexAttr[VD, ED](mapFunc, "srcAttr")
val mapUsesDstAttr = accessesVertexAttr[VD, ED](mapFunc, "dstAttr")
val vs = activeSetOpt match {
case Some((activeSet, _)) => vTableReplicated.get(mapUsesSrcAttr, mapUsesDstAttr, activeSet)
case None => vTableReplicated.get(mapUsesSrcAttr, mapUsesDstAttr)
}
val activeDirectionOpt = activeSetOpt.map(_._2)
// Map and combine.
val preAgg = edges.zipEdgePartitions(vs) { (edgePartition, vTableReplicatedIter) =>
val (_, vPart) = vTableReplicatedIter.next()
// Choose scan method
val activeFraction = vPart.numActives.getOrElse(0) / edgePartition.indexSize.toFloat
val edgeIter = activeDirectionOpt match {
case Some(EdgeDirection.Both) =>
if (activeFraction < 0.8) {
edgePartition.indexIterator(srcVid => vPart.isActive(srcVid))
.filter(e => vPart.isActive(e.dstId))
} else {
edgePartition.iterator.filter(e => vPart.isActive(e.srcId) && vPart.isActive(e.dstId))
}
case Some(EdgeDirection.Out) =>
if (activeFraction < 0.8) {
edgePartition.indexIterator(srcVid => vPart.isActive(srcVid))
} else {
edgePartition.iterator.filter(e => vPart.isActive(e.srcId))
}
case Some(EdgeDirection.In) =>
edgePartition.iterator.filter(e => vPart.isActive(e.dstId))
case None =>
edgePartition.iterator
}
// Scan edges and run the map function
val et = new EdgeTriplet[VD, ED]
val mapOutputs = edgeIter.flatMap { e =>
et.set(e)
if (mapUsesSrcAttr) {
et.srcAttr = vPart(e.srcId)
}
if (mapUsesDstAttr) {
et.dstAttr = vPart(e.dstId)
}
mapFunc(et)
}
// Note: This doesn't allow users to send messages to arbitrary vertices.
vPart.aggregateUsingIndex(mapOutputs, reduceFunc).iterator
}
// do the final reduction reusing the index map
vertices.aggregateUsingIndex(preAgg, reduceFunc)
} // end of mapReduceTriplets
override def outerJoinVertices[U: ClassManifest, VD2: ClassManifest] override def outerJoinVertices[U: ClassManifest, VD2: ClassManifest]
(updates: RDD[(Vid, U)])(updateF: (Vid, VD, Option[U]) => VD2): Graph[VD2, ED] = { (updates: RDD[(Vid, U)])(updateF: (Vid, VD, Option[U]) => VD2): Graph[VD2, ED] = {
ClosureCleaner.clean(updateF) if (classManifest[VD] equals classManifest[VD2]) {
val newVTable = vTable.leftJoin(updates)(updateF) // updateF preserves type, so we can use incremental replication
new GraphImpl(newVTable, eTable, vertexPlacement, partitioner) val newVerts = vertices.leftJoin(updates)(updateF)
val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts)
val newVTableReplicated = new VTableReplicated[VD2](
changedVerts, edges, vertexPlacement,
Some(vTableReplicated.asInstanceOf[VTableReplicated[VD2]]))
new GraphImpl(newVerts, edges, vertexPlacement, newVTableReplicated)
} else {
// updateF does not preserve type, so we must re-replicate all vertices
val newVerts = vertices.leftJoin(updates)(updateF)
new GraphImpl(newVerts, edges, vertexPlacement)
}
}
private def accessesVertexAttr[VD, ED](closure: AnyRef, attrName: String): Boolean = {
try {
BytecodeUtils.invokedMethod(closure, classOf[EdgeTriplet[VD, ED]], attrName)
} catch {
case _: ClassNotFoundException => true // if we don't know, be conservative
}
} }
} // end of class GraphImpl } // end of class GraphImpl
@ -241,70 +323,35 @@ object GraphImpl {
def apply[VD: ClassManifest, ED: ClassManifest]( def apply[VD: ClassManifest, ED: ClassManifest](
edges: RDD[Edge[ED]], edges: RDD[Edge[ED]],
defaultValue: VD, defaultVertexAttr: VD): GraphImpl[VD, ED] =
partitionStrategy: PartitionStrategy): GraphImpl[VD, ED] = { {
val etable = createETable(edges, partitionStrategy).cache fromEdgeRDD(createETable(edges), defaultVertexAttr)
// Get the set of all vids
val vids = etable.mapPartitions(iter => {
val (pid, epart) = iter.next()
assert(!iter.hasNext)
epart.iterator.flatMap(e => Iterator(e.srcId, e.dstId))
}, preservesPartitioning = true)
// Index the set of all vids
val index = VertexSetRDD.makeIndex(vids)
// Index the vertices and fill in missing attributes with the default
val vtable = VertexSetRDD(index, defaultValue)
val vertexPlacement = new VertexPlacement(etable, vtable)
new GraphImpl(vtable, etable, vertexPlacement, partitionStrategy)
} }
// def apply[VD: ClassManifest, ED: ClassManifest]( def fromEdgePartitions[VD: ClassManifest, ED: ClassManifest](
// vertices: RDD[(Vid, VD)], edges: RDD[(Pid, EdgePartition[ED])],
// edges: RDD[Edge[ED]], defaultVertexAttr: VD): GraphImpl[VD, ED] = {
// defaultVertexAttr: VD): GraphImpl[VD,ED] = { fromEdgeRDD(createETableFromEdgePartitions(edges), defaultVertexAttr)
// apply(vertices, edges, defaultVertexAttr, (a:VD, b:VD) => a, RandomVertexCut()) }
// }
// def apply[VD: ClassManifest, ED: ClassManifest](
// vertices: RDD[(Vid, VD)],
// edges: RDD[Edge[ED]],
// defaultVertexAttr: VD,
// partitionStrategy: PartitionStrategy): GraphImpl[VD,ED] = {
// apply(vertices, edges, defaultVertexAttr, (a:VD, b:VD) => a, partitionStrategy)
// }
// def apply[VD: ClassManifest, ED: ClassManifest](
// vertices: RDD[(Vid, VD)],
// edges: RDD[Edge[ED]],
// defaultVertexAttr: VD,
// mergeFunc: (VD, VD) => VD): GraphImpl[VD,ED] = {
// apply(vertices, edges, defaultVertexAttr, mergeFunc, RandomVertexCut())
// }
def apply[VD: ClassManifest, ED: ClassManifest]( def apply[VD: ClassManifest, ED: ClassManifest](
vertices: RDD[(Vid, VD)], vertices: RDD[(Vid, VD)],
edges: RDD[Edge[ED]], edges: RDD[Edge[ED]],
defaultVertexAttr: VD, defaultVertexAttr: VD): GraphImpl[VD, ED] =
mergeFunc: (VD, VD) => VD, {
partitionStrategy: PartitionStrategy): GraphImpl[VD, ED] = { val etable = createETable(edges).cache()
vertices.cache // Get the set of all vids
val etable = createETable(edges, partitionStrategy).cache
// Get the set of all vids, preserving partitions
val partitioner = Partitioner.defaultPartitioner(vertices) val partitioner = Partitioner.defaultPartitioner(vertices)
val implicitVids = etable.flatMap { val vPartitioned = vertices.partitionBy(partitioner)
case (pid, partition) => Array.concat(partition.srcIds, partition.dstIds) val vidsFromEdges = collectVidsFromEdges(etable, partitioner)
}.map(vid => (vid, ())).partitionBy(partitioner) val vids = vPartitioned.zipPartitions(vidsFromEdges) { (vertexIter, vidsFromEdgesIter) =>
val allVids = vertices.zipPartitions(implicitVids, preservesPartitioning = true) { vertexIter.map(_._1) ++ vidsFromEdgesIter.map(_._1)
(a, b) => a.map(_._1) ++ b.map(_._1)
} }
// Index the set of all vids
val index = VertexSetRDD.makeIndex(allVids, Some(partitioner))
// Index the vertices and fill in missing attributes with the default
val vtable = VertexSetRDD(vertices, index, mergeFunc).fillMissing(defaultVertexAttr)
val vertexPlacement = new VertexPlacement(etable, vtable) val vtable = VertexRDD(vids, vPartitioned, defaultVertexAttr)
new GraphImpl(vtable, etable, vertexPlacement, partitionStrategy)
new GraphImpl(vtable, etable)
} }
/** /**
@ -315,39 +362,21 @@ object GraphImpl {
* key-value pair: the key is the partition id, and the value is an EdgePartition object * key-value pair: the key is the partition id, and the value is an EdgePartition object
* containing all the edges in a partition. * containing all the edges in a partition.
*/ */
protected def createETable[ED: ClassManifest]( private def createETable[ED: ClassManifest](
edges: RDD[Edge[ED]], edges: RDD[Edge[ED]]): EdgeRDD[ED] = {
partitionStrategy: PartitionStrategy): RDD[(Pid, EdgePartition[ED])] = { val eTable = edges.mapPartitionsWithIndex { (pid, iter) =>
// Get the number of partitions
val numPartitions = edges.partitions.size
edges.map { e =>
val part: Pid = partitionStrategy.getPartition(e.srcId, e.dstId, numPartitions)
// Should we be using 3-tuple or an optimized class
new MessageToPartition(part, (e.srcId, e.dstId, e.attr))
}
.partitionBy(new HashPartitioner(numPartitions))
.mapPartitionsWithIndex( (pid, iter) => {
val builder = new EdgePartitionBuilder[ED] val builder = new EdgePartitionBuilder[ED]
iter.foreach { message => iter.foreach { e =>
val data = message.data builder.add(e.srcId, e.dstId, e.attr)
builder.add(data._1, data._2, data._3)
} }
val edgePartition = builder.toEdgePartition Iterator((pid, builder.toEdgePartition))
Iterator((pid, edgePartition)) }
}, preservesPartitioning = true).cache() new EdgeRDD(eTable)
} }
protected def makeTriplets[VD: ClassManifest, ED: ClassManifest]( private def createETableFromEdgePartitions[ED: ClassManifest](
vTableReplicated: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))], edges: RDD[(Pid, EdgePartition[ED])]): EdgeRDD[ED] = {
eTable: RDD[(Pid, EdgePartition[ED])]): RDD[EdgeTriplet[VD, ED]] = { new EdgeRDD(edges)
eTable.zipPartitions(vTableReplicated) {
(eTableIter, vTableReplicatedIter) =>
val (_, edgePartition) = eTableIter.next()
val (_, (vidToIndex, vertexArray)) = vTableReplicatedIter.next()
new EdgeTripletIterator(vidToIndex, vertexArray, edgePartition)
}
} }
def mask[VD: ClassManifest, ED: ClassManifest, VD2: ClassManifest, ED2: ClassManifest] ( def mask[VD: ClassManifest, ED: ClassManifest, VD2: ClassManifest, ED2: ClassManifest] (
@ -382,95 +411,24 @@ object GraphImpl {
new GraphImpl(newVTable, newETable, newVertexPlacement, thisImpl.partitioner) new GraphImpl(newVTable, newETable, newVertexPlacement, thisImpl.partitioner)
} }
private def fromEdgeRDD[VD: ClassManifest, ED: ClassManifest](
protected def mapTriplets[VD: ClassManifest, ED: ClassManifest, ED2: ClassManifest]( edges: EdgeRDD[ED],
g: GraphImpl[VD, ED], defaultVertexAttr: VD): GraphImpl[VD, ED] = {
f: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] = { edges.cache()
val newETable = g.eTable.zipPartitions( // Get the set of all vids
g.vTableReplicated.bothAttrs, preservesPartitioning = true val vids = collectVidsFromEdges(edges, new HashPartitioner(edges.partitions.size))
) { // Create the VertexRDD.
(eTableIter, vTableReplicatedIter) => val vtable = VertexRDD(vids.mapValues(x => defaultVertexAttr))
val (pid, edgePartition) = eTableIter.next() new GraphImpl(vtable, edges)
val (_, (vidToIndex, vertexArray)) = vTableReplicatedIter.next()
val et = new EdgeTriplet[VD, ED]
val vmap = new PrimitiveKeyOpenHashMap[Vid, VD](vidToIndex, vertexArray)
val newEdgePartition = edgePartition.map { e =>
et.set(e)
et.srcAttr = vmap(e.srcId)
et.dstAttr = vmap(e.dstId)
f(et)
}
Iterator((pid, newEdgePartition))
}
new GraphImpl(g.vTable, newETable, g.vertexPlacement, g.partitioner)
} }
protected def mapReduceTriplets[VD: ClassManifest, ED: ClassManifest, A: ClassManifest]( /** Collects all vids mentioned in edges and partitions them by partitioner. */
g: GraphImpl[VD, ED], private def collectVidsFromEdges(
mapFunc: EdgeTriplet[VD, ED] => Iterator[(Vid, A)], edges: EdgeRDD[_],
reduceFunc: (A, A) => A): VertexSetRDD[A] = { partitioner: Partitioner): RDD[(Vid, Int)] = {
ClosureCleaner.clean(mapFunc) // TODO: Consider doing map side distinct before shuffle.
ClosureCleaner.clean(reduceFunc) new ShuffledRDD[Vid, Int, (Vid, Int)](
// For each vertex, replicate its attribute only to partitions where it is edges.collectVids.map(vid => (vid, 0)), partitioner)
// in the relevant position in an edge. .setSerializer(classOf[VidMsgSerializer].getName)
val mapFuncUsesSrcAttr = accessesVertexAttr[VD, ED](mapFunc, "srcAttr")
val mapFuncUsesDstAttr = accessesVertexAttr[VD, ED](mapFunc, "dstAttr")
// Map and preaggregate
val preAgg = g.eTable.zipPartitions(
g.vTableReplicated.get(mapFuncUsesSrcAttr, mapFuncUsesDstAttr)
) {
(edgePartitionIter, vTableReplicatedIter) =>
val (_, edgePartition) = edgePartitionIter.next()
val (_, (vidToIndex, vertexArray)) = vTableReplicatedIter.next()
assert(vidToIndex.capacity == vertexArray.size)
val vmap = new PrimitiveKeyOpenHashMap[Vid, VD](vidToIndex, vertexArray)
// TODO(jegonzal): This doesn't allow users to send messages to arbitrary vertices.
val msgArray = new Array[A](vertexArray.size)
val msgBS = new BitSet(vertexArray.size)
// Iterate over the partition
val et = new EdgeTriplet[VD, ED]
edgePartition.foreach { e =>
et.set(e)
if (mapFuncUsesSrcAttr) {
et.srcAttr = vmap(e.srcId)
} }
if (mapFuncUsesDstAttr) {
et.dstAttr = vmap(e.dstId)
}
// TODO(rxin): rewrite the foreach using a simple while loop to speed things up.
// Also given we are only allowing zero, one, or two messages, we can completely unroll
// the for loop.
mapFunc(et).foreach { case (vid, msg) =>
// verify that the vid is valid
assert(vid == et.srcId || vid == et.dstId)
// Get the index of the key
val ind = vidToIndex.getPos(vid) & OpenHashSet.POSITION_MASK
// Populate the aggregator map
if (msgBS.get(ind)) {
msgArray(ind) = reduceFunc(msgArray(ind), msg)
} else {
msgArray(ind) = msg
msgBS.set(ind)
}
}
}
// construct an iterator of tuples Iterator[(Vid, A)]
msgBS.iterator.map { ind =>
new AggregationMsg[A](vidToIndex.getValue(ind), msgArray(ind))
}
}.partitionBy(g.vTable.partitioner.get)
// do the final reduction reusing the index map
VertexSetRDD.aggregate(preAgg, g.vTable.index, reduceFunc)
}
private def accessesVertexAttr[VD: ClassManifest, ED: ClassManifest](
closure: AnyRef, attrName: String): Boolean = {
try {
BytecodeUtils.invokedMethod(closure, classOf[EdgeTriplet[VD, ED]], attrName)
} catch {
case _: ClassNotFoundException => true // if we don't know, be conservative
}
}
} // end of object GraphImpl } // end of object GraphImpl
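
In `mapReduceTriplets` above, the scan method is chosen per edge partition: when the fraction of active source vertices is small (0.8 is a heuristic threshold), it jumps to the clusters of active sources via the index; otherwise it scans every edge and filters. A rough stand-alone sketch of that decision over plain arrays and sets (assumed names, not the real API):

def activeEdges(
    srcIds: Array[Long],            // clustered (sorted) by source id
    dstIds: Array[Long],
    clusterStart: Map[Long, Int],   // source id -> offset of its first edge
    active: Set[Long]): Iterator[(Long, Long)] = {
  // Analogue of numActives / indexSize in the code above; 0.8 is the same heuristic threshold.
  val activeFraction =
    if (clusterStart.isEmpty) 0.0 else active.size.toDouble / clusterStart.size
  if (activeFraction < 0.8) {
    // Index scan: jump straight to the runs of active source vertices.
    active.iterator.flatMap { src =>
      clusterStart.get(src).iterator.flatMap { start =>
        Iterator.from(start)
          .takeWhile(i => i < srcIds.length && srcIds(i) == src)
          .map(i => (srcIds(i), dstIds(i)))
      }
    }
  } else {
    // Full scan with a filter.
    (0 until srcIds.length).iterator
      .filter(i => active.contains(srcIds(i)))
      .map(i => (srcIds(i), dstIds(i)))
  }
}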


@ -19,17 +19,6 @@ class VertexBroadcastMsg[@specialized(Int, Long, Double, Boolean) T](
} }
class AggregationMsg[@specialized(Int, Long, Double, Boolean) T](var vid: Vid, var data: T)
extends Product2[Vid, T] {
override def _1 = vid
override def _2 = data
override def canEqual(that: Any): Boolean = that.isInstanceOf[AggregationMsg[_]]
}
/** /**
* A message used to send a specific value to a partition. * A message used to send a specific value to a partition.
* @param partition index of the target partition. * @param partition index of the target partition.
@ -65,23 +54,6 @@ class VertexBroadcastMsgRDDFunctions[T: ClassManifest](self: RDD[VertexBroadcast
} }
class AggregationMessageRDDFunctions[T: ClassManifest](self: RDD[AggregationMsg[T]]) {
def partitionBy(partitioner: Partitioner): RDD[AggregationMsg[T]] = {
val rdd = new ShuffledRDD[Vid, T, AggregationMsg[T]](self, partitioner)
// Set a custom serializer if the data is of int or double type.
if (classManifest[T] == ClassManifest.Int) {
rdd.setSerializer(classOf[IntAggMsgSerializer].getName)
} else if (classManifest[T] == ClassManifest.Long) {
rdd.setSerializer(classOf[LongAggMsgSerializer].getName)
} else if (classManifest[T] == ClassManifest.Double) {
rdd.setSerializer(classOf[DoubleAggMsgSerializer].getName)
}
rdd
}
}
class MsgRDDFunctions[T: ClassManifest](self: RDD[MessageToPartition[T]]) { class MsgRDDFunctions[T: ClassManifest](self: RDD[MessageToPartition[T]]) {
/** /**
@ -103,7 +75,17 @@ object MsgRDDFunctions {
new VertexBroadcastMsgRDDFunctions(rdd) new VertexBroadcastMsgRDDFunctions(rdd)
} }
implicit def rdd2aggMessageRDDFunctions[T: ClassManifest](rdd: RDD[AggregationMsg[T]]) = { def partitionForAggregation[T: ClassManifest](msgs: RDD[(Vid, T)], partitioner: Partitioner) = {
new AggregationMessageRDDFunctions(rdd) val rdd = new ShuffledRDD[Vid, T, (Vid, T)](msgs, partitioner)
// Set a custom serializer if the data is of int or double type.
if (classManifest[T] == ClassManifest.Int) {
rdd.setSerializer(classOf[IntAggMsgSerializer].getName)
} else if (classManifest[T] == ClassManifest.Long) {
rdd.setSerializer(classOf[LongAggMsgSerializer].getName)
} else if (classManifest[T] == ClassManifest.Double) {
rdd.setSerializer(classOf[DoubleAggMsgSerializer].getName)
}
rdd
} }
} }
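
`partitionForAggregation` swaps in a specialized shuffle serializer when the message payload is an Int, Long, or Double, identified by comparing ClassManifests. A minimal stand-alone illustration of that dispatch (the returned strings are just labels here, not real registrations):

// Mirrors the manifest comparison used above (Scala 2.10-era ClassManifest API).
def serializerNameFor[T: ClassManifest]: Option[String] = {
  if (classManifest[T] == ClassManifest.Int) Some("IntAggMsgSerializer")
  else if (classManifest[T] == ClassManifest.Long) Some("LongAggMsgSerializer")
  else if (classManifest[T] == ClassManifest.Double) Some("DoubleAggMsgSerializer")
  else None   // fall back to the default shuffle serializer
}

println(serializerNameFor[Int])      // Some(IntAggMsgSerializer)
println(serializerNameFor[String])   // None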


@ -3,8 +3,27 @@ package org.apache.spark.graph.impl
import java.io.{EOFException, InputStream, OutputStream} import java.io.{EOFException, InputStream, OutputStream}
import java.nio.ByteBuffer import java.nio.ByteBuffer
import org.apache.spark.graph._
import org.apache.spark.serializer._ import org.apache.spark.serializer._
class VidMsgSerializer extends Serializer {
override def newInstance(): SerializerInstance = new ShuffleSerializerInstance {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = {
val msg = t.asInstanceOf[(Vid, _)]
writeVarLong(msg._1, optimizePositive = false)
this
}
}
override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) {
override def readObject[T](): T = {
(readVarLong(optimizePositive = false), null).asInstanceOf[T]
}
}
}
}
/** A special shuffle serializer for VertexBroadcastMessage[Int]. */ /** A special shuffle serializer for VertexBroadcastMessage[Int]. */
class IntVertexBroadcastMsgSerializer extends Serializer { class IntVertexBroadcastMsgSerializer extends Serializer {
@ -13,7 +32,7 @@ class IntVertexBroadcastMsgSerializer extends Serializer {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[VertexBroadcastMsg[Int]] val msg = t.asInstanceOf[VertexBroadcastMsg[Int]]
writeLong(msg.vid) writeVarLong(msg.vid, optimizePositive = false)
writeInt(msg.data) writeInt(msg.data)
this this
} }
@ -21,7 +40,9 @@ class IntVertexBroadcastMsgSerializer extends Serializer {
override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) { override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) {
override def readObject[T](): T = { override def readObject[T](): T = {
new VertexBroadcastMsg[Int](0, readLong(), readInt()).asInstanceOf[T] val a = readVarLong(optimizePositive = false)
val b = readInt()
new VertexBroadcastMsg[Int](0, a, b).asInstanceOf[T]
} }
} }
} }
@ -34,7 +55,7 @@ class LongVertexBroadcastMsgSerializer extends Serializer {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[VertexBroadcastMsg[Long]] val msg = t.asInstanceOf[VertexBroadcastMsg[Long]]
writeLong(msg.vid) writeVarLong(msg.vid, optimizePositive = false)
writeLong(msg.data) writeLong(msg.data)
this this
} }
@ -42,7 +63,7 @@ class LongVertexBroadcastMsgSerializer extends Serializer {
override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) { override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) {
override def readObject[T](): T = { override def readObject[T](): T = {
val a = readLong() val a = readVarLong(optimizePositive = false)
val b = readLong() val b = readLong()
new VertexBroadcastMsg[Long](0, a, b).asInstanceOf[T] new VertexBroadcastMsg[Long](0, a, b).asInstanceOf[T]
} }
@ -57,7 +78,7 @@ class DoubleVertexBroadcastMsgSerializer extends Serializer {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[VertexBroadcastMsg[Double]] val msg = t.asInstanceOf[VertexBroadcastMsg[Double]]
writeLong(msg.vid) writeVarLong(msg.vid, optimizePositive = false)
writeDouble(msg.data) writeDouble(msg.data)
this this
} }
@ -65,7 +86,7 @@ class DoubleVertexBroadcastMsgSerializer extends Serializer {
override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) { override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) {
def readObject[T](): T = { def readObject[T](): T = {
val a = readLong() val a = readVarLong(optimizePositive = false)
val b = readDouble() val b = readDouble()
new VertexBroadcastMsg[Double](0, a, b).asInstanceOf[T] new VertexBroadcastMsg[Double](0, a, b).asInstanceOf[T]
} }
@ -73,25 +94,24 @@ class DoubleVertexBroadcastMsgSerializer extends Serializer {
} }
} }
/** A special shuffle serializer for AggregationMessage[Int]. */ /** A special shuffle serializer for AggregationMessage[Int]. */
class IntAggMsgSerializer extends Serializer { class IntAggMsgSerializer extends Serializer {
override def newInstance(): SerializerInstance = new ShuffleSerializerInstance { override def newInstance(): SerializerInstance = new ShuffleSerializerInstance {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[AggregationMsg[Int]] val msg = t.asInstanceOf[(Vid, Int)]
writeLong(msg.vid) writeVarLong(msg._1, optimizePositive = false)
writeUnsignedVarInt(msg.data) writeUnsignedVarInt(msg._2)
this this
} }
} }
override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) { override def deserializeStream(s: InputStream) = new ShuffleDeserializationStream(s) {
override def readObject[T](): T = { override def readObject[T](): T = {
val a = readLong() val a = readVarLong(optimizePositive = false)
val b = readUnsignedVarInt() val b = readUnsignedVarInt()
new AggregationMsg[Int](a, b).asInstanceOf[T] (a, b).asInstanceOf[T]
} }
} }
} }
@ -103,9 +123,9 @@ class LongAggMsgSerializer extends Serializer {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[AggregationMsg[Long]] val msg = t.asInstanceOf[(Vid, Long)]
writeVarLong(msg.vid, optimizePositive = false) writeVarLong(msg._1, optimizePositive = false)
writeVarLong(msg.data, optimizePositive = true) writeVarLong(msg._2, optimizePositive = true)
this this
} }
} }
@ -114,22 +134,21 @@ class LongAggMsgSerializer extends Serializer {
override def readObject[T](): T = { override def readObject[T](): T = {
val a = readVarLong(optimizePositive = false) val a = readVarLong(optimizePositive = false)
val b = readVarLong(optimizePositive = true) val b = readVarLong(optimizePositive = true)
new AggregationMsg[Long](a, b).asInstanceOf[T] (a, b).asInstanceOf[T]
} }
} }
} }
} }
/** A special shuffle serializer for AggregationMessage[Double]. */ /** A special shuffle serializer for AggregationMessage[Double]. */
class DoubleAggMsgSerializer extends Serializer { class DoubleAggMsgSerializer extends Serializer {
override def newInstance(): SerializerInstance = new ShuffleSerializerInstance { override def newInstance(): SerializerInstance = new ShuffleSerializerInstance {
override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) {
def writeObject[T](t: T) = { def writeObject[T](t: T) = {
val msg = t.asInstanceOf[AggregationMsg[Double]] val msg = t.asInstanceOf[(Vid, Double)]
writeVarLong(msg.vid, optimizePositive = false) writeVarLong(msg._1, optimizePositive = false)
writeDouble(msg.data) writeDouble(msg._2)
this this
} }
} }
@ -138,7 +157,7 @@ class DoubleAggMsgSerializer extends Serializer {
def readObject[T](): T = { def readObject[T](): T = {
val a = readVarLong(optimizePositive = false) val a = readVarLong(optimizePositive = false)
val b = readDouble() val b = readDouble()
new AggregationMsg[Double](a, b).asInstanceOf[T] (a, b).asInstanceOf[T]
} }
} }
} }
@ -148,7 +167,7 @@ class DoubleAggMsgSerializer extends Serializer {
// Helper classes to shorten the implementation of those special serializers. // Helper classes to shorten the implementation of those special serializers.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
sealed abstract class ShuffleSerializationStream(s: OutputStream) extends SerializationStream { abstract class ShuffleSerializationStream(s: OutputStream) extends SerializationStream {
// The implementation should override this one. // The implementation should override this one.
def writeObject[T](t: T): SerializationStream def writeObject[T](t: T): SerializationStream
@ -261,8 +280,7 @@ sealed abstract class ShuffleSerializationStream(s: OutputStream) extends Serial
override def close(): Unit = s.close() override def close(): Unit = s.close()
} }
abstract class ShuffleDeserializationStream(s: InputStream) extends DeserializationStream {
sealed abstract class ShuffleDeserializationStream(s: InputStream) extends DeserializationStream {
// The implementation should override this one. // The implementation should override this one.
def readObject[T](): T def readObject[T](): T
@ -277,7 +295,7 @@ sealed abstract class ShuffleDeserializationStream(s: InputStream) extends Deser
var i: Int = 0 var i: Int = 0
def readOrThrow(): Int = { def readOrThrow(): Int = {
val in = s.read() val in = s.read()
if (in < 0) throw new java.io.EOFException if (in < 0) throw new EOFException
in & 0xFF in & 0xFF
} }
var b: Int = readOrThrow() var b: Int = readOrThrow()
@ -291,22 +309,45 @@ sealed abstract class ShuffleDeserializationStream(s: InputStream) extends Deser
} }
def readVarLong(optimizePositive: Boolean): Long = { def readVarLong(optimizePositive: Boolean): Long = {
// TODO: unroll the while loop.
var value: Long = 0L
var i: Int = 0
def readOrThrow(): Int = { def readOrThrow(): Int = {
val in = s.read() val in = s.read()
if (in < 0) throw new java.io.EOFException if (in < 0) throw new EOFException
in & 0xFF in & 0xFF
} }
var b: Int = readOrThrow() var b = readOrThrow()
while ((b & 0x80) != 0) { var ret: Long = b & 0x7F
value |= (b & 0x7F).toLong << i if ((b & 0x80) != 0) {
i += 7
if (i > 63) throw new IllegalArgumentException("Variable length quantity is too long")
b = readOrThrow() b = readOrThrow()
ret |= (b & 0x7F) << 7
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F) << 14
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F) << 21
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F).toLong << 28
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F).toLong << 35
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F).toLong << 42
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= (b & 0x7F).toLong << 49
if ((b & 0x80) != 0) {
b = readOrThrow()
ret |= b.toLong << 56
}
}
}
}
}
}
}
} }
val ret = value | (b.toLong << i)
if (!optimizePositive) (ret >>> 1) ^ -(ret & 1) else ret if (!optimizePositive) (ret >>> 1) ^ -(ret & 1) else ret
} }
@ -329,7 +370,6 @@ sealed abstract class ShuffleDeserializationStream(s: InputStream) extends Deser
override def close(): Unit = s.close() override def close(): Unit = s.close()
} }
sealed trait ShuffleSerializerInstance extends SerializerInstance { sealed trait ShuffleSerializerInstance extends SerializerInstance {
override def serialize[T](t: T): ByteBuffer = throw new UnsupportedOperationException override def serialize[T](t: T): ByteBuffer = throw new UnsupportedOperationException
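
The vertex ids above are written with `writeVarLong(..., optimizePositive = false)`: the value is zigzag-mapped so that ids of small magnitude stay short, then emitted as little-endian 7-bit groups with a continuation bit, which is the layout the hand-unrolled `readVarLong` decodes. The loop-based sketch below illustrates the same zigzag-plus-continuation-bit idea on in-memory streams; it is not guaranteed byte-compatible with the unrolled reader at the extreme end of the range (that reader packs the final group into a full ninth byte), so treat it as an illustration rather than a drop-in reimplementation.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, EOFException, InputStream, OutputStream}

// Zigzag-map signed values onto unsigned ones, then emit 7 bits per byte, MSB = "more bytes follow".
def writeVarLong(v: Long, out: OutputStream, optimizePositive: Boolean): Unit = {
  var value = if (optimizePositive) v else (v << 1) ^ (v >> 63)
  while ((value & ~0x7FL) != 0) {
    out.write(((value & 0x7F) | 0x80).toInt)
    value >>>= 7
  }
  out.write(value.toInt)
}

def readVarLong(in: InputStream, optimizePositive: Boolean): Long = {
  var shift = 0
  var ret = 0L
  var more = true
  while (more) {
    val b = in.read()
    if (b < 0) throw new EOFException
    ret |= (b & 0x7F).toLong << shift
    shift += 7
    more = (b & 0x80) != 0
  }
  if (optimizePositive) ret else (ret >>> 1) ^ -(ret & 1)
}

val buf = new ByteArrayOutputStream()
Seq(0L, 1L, -1L, 300L, 1000000L).foreach(writeVarLong(_, buf, optimizePositive = false))
val in = new ByteArrayInputStream(buf.toByteArray)
(1 to 5).foreach(_ => println(readVarLong(in, optimizePositive = false)))   // 0, 1, -1, 300, 1000000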


@ -2,72 +2,38 @@ package org.apache.spark.graph.impl
import org.apache.spark.SparkContext._ import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.util.collection.{OpenHashSet, PrimitiveKeyOpenHashMap} import org.apache.spark.util.collection.{PrimitiveVector, OpenHashSet}
import org.apache.spark.graph._ import org.apache.spark.graph._
import org.apache.spark.graph.impl.MsgRDDFunctions._
/** /**
* Stores the vertex attribute values after they are replicated. * A view of the vertices after they are shipped to the join sites specified in
* `vertexPlacement`. The resulting view is co-partitioned with `edges`. If `prevVTableReplicated`
* is specified, `updatedVerts` are treated as incremental updates to the previous view. Otherwise,
* a fresh view is created.
*
* The view is always cached (i.e., once it is created, it remains materialized). This avoids
* constructing it twice if the user calls graph.triplets followed by graph.mapReduceTriplets, for
* example.
*/ */
private[impl]
class VTableReplicated[VD: ClassManifest]( class VTableReplicated[VD: ClassManifest](
vTable: VertexSetRDD[VD], updatedVerts: VertexRDD[VD],
eTable: RDD[(Pid, EdgePartition[ED])] forSome { type ED }, edges: EdgeRDD[_],
vertexPlacement: VertexPlacement) {
val bothAttrs: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] =
VTableReplicated.createVTableReplicated(vTable, eTable, vertexPlacement, true, true)
val srcAttrOnly: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] =
VTableReplicated.createVTableReplicated(vTable, eTable, vertexPlacement, true, false)
val dstAttrOnly: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] =
VTableReplicated.createVTableReplicated(vTable, eTable, vertexPlacement, false, true)
val noAttrs: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] =
VTableReplicated.createVTableReplicated(vTable, eTable, vertexPlacement, false, false)
def get(includeSrcAttr: Boolean, includeDstAttr: Boolean)
: RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] =
(includeSrcAttr, includeDstAttr) match {
case (true, true) => bothAttrs
case (true, false) => srcAttrOnly
case (false, true) => dstAttrOnly
case (false, false) => noAttrs
}
}
class VertexAttributeBlock[VD: ClassManifest](val vids: Array[Vid], val attrs: Array[VD])
object VTableReplicated {
protected def createVTableReplicated[VD: ClassManifest](
vTable: VertexSetRDD[VD],
eTable: RDD[(Pid, EdgePartition[ED])] forSome { type ED },
vertexPlacement: VertexPlacement, vertexPlacement: VertexPlacement,
includeSrcAttr: Boolean, prevVTableReplicated: Option[VTableReplicated[VD]] = None) {
includeDstAttr: Boolean): RDD[(Pid, (VertexIdToIndexMap, Array[VD]))] = {
val placement = vertexPlacement.get(includeSrcAttr, includeDstAttr)
// Send each edge partition the vertex attributes it wants, as specified in /**
// vertexPlacement * Within each edge partition, create a local map from vid to an index into the attribute
val msgsByPartition = placement.zipPartitions(vTable.partitionsRDD) { * array. Each map contains a superset of the vertices that it will receive, because it stores
(pid2vidIter, vertexPartIter) => * vids from both the source and destination of edges. It must always include both source and
val pid2vid = pid2vidIter.next() * destination vids because some operations, such as GraphImpl.mapReduceTriplets, rely on this.
val vertexPart = vertexPartIter.next() */
private val localVidMap: RDD[(Int, VertexIdToIndexMap)] = prevVTableReplicated match {
val vmap = new PrimitiveKeyOpenHashMap(vertexPart.index, vertexPart.values) case Some(prev) =>
val output = new Array[(Pid, VertexAttributeBlock[VD])](pid2vid.size) prev.localVidMap
for (pid <- 0 until pid2vid.size) { case None =>
val block = new VertexAttributeBlock(pid2vid(pid), pid2vid(pid).map(vid => vmap(vid))) edges.partitionsRDD.mapPartitions(_.map {
output(pid) = (pid, block)
}
output.iterator
}.partitionBy(eTable.partitioner.get).cache()
// Within each edge partition, create a local map from vid to an index into
// the attribute array. Each map contains a superset of the vertices that it
// will receive, because it stores vids from both the source and destination
// of edges. It must always include both source and destination vids because
// some operations, such as GraphImpl.mapReduceTriplets, rely on this.
val localVidMap = eTable.mapPartitions(_.map {
case (pid, epart) => case (pid, epart) =>
val vidToIndex = new VertexIdToIndexMap val vidToIndex = new VertexIdToIndexMap
epart.foreach { e => epart.foreach { e =>
@ -75,26 +41,141 @@ object VTableReplicated {
vidToIndex.add(e.dstId) vidToIndex.add(e.dstId)
} }
(pid, vidToIndex) (pid, vidToIndex)
}, preservesPartitioning = true).cache() }, preservesPartitioning = true).cache().setName("VTableReplicated localVidMap")
}
// Within each edge partition, place the vertex attributes received from private lazy val bothAttrs: RDD[(Pid, VertexPartition[VD])] = create(true, true)
// msgsByPartition into the correct locations specified in localVidMap private lazy val srcAttrOnly: RDD[(Pid, VertexPartition[VD])] = create(true, false)
localVidMap.zipPartitions(msgsByPartition) { private lazy val dstAttrOnly: RDD[(Pid, VertexPartition[VD])] = create(false, true)
(mapIter, msgsIter) => private lazy val noAttrs: RDD[(Pid, VertexPartition[VD])] = create(false, false)
def get(includeSrc: Boolean, includeDst: Boolean): RDD[(Pid, VertexPartition[VD])] = {
(includeSrc, includeDst) match {
case (true, true) => bothAttrs
case (true, false) => srcAttrOnly
case (false, true) => dstAttrOnly
case (false, false) => noAttrs
}
}
def get(
includeSrc: Boolean,
includeDst: Boolean,
actives: VertexRDD[_]): RDD[(Pid, VertexPartition[VD])] = {
// Ship active sets to edge partitions using vertexPlacement, but ignoring includeSrc and
// includeDst. These flags govern attribute shipping, but the activeness of a vertex must be
// shipped to all edges mentioning that vertex, regardless of whether the vertex attribute is
// also shipped there.
val shippedActives = vertexPlacement.get(true, true)
.zipPartitions(actives.partitionsRDD)(VTableReplicated.buildActiveBuffer(_, _))
.partitionBy(edges.partitioner.get)
// Update vTableReplicated with shippedActives, setting activeness flags in the resulting
// VertexPartitions
get(includeSrc, includeDst).zipPartitions(shippedActives) { (viewIter, shippedActivesIter) =>
val (pid, vPart) = viewIter.next()
val newPart = vPart.replaceActives(shippedActivesIter.flatMap(_._2.iterator))
Iterator((pid, newPart))
}
}
private def create(includeSrc: Boolean, includeDst: Boolean)
: RDD[(Pid, VertexPartition[VD])] = {
val vdManifest = classManifest[VD]
// Ship vertex attributes to edge partitions according to vertexPlacement
val verts = updatedVerts.partitionsRDD
val shippedVerts = vertexPlacement.get(includeSrc, includeDst)
.zipPartitions(verts)(VTableReplicated.buildBuffer(_, _)(vdManifest))
.partitionBy(edges.partitioner.get)
// TODO: Consider using a specialized shuffler.
prevVTableReplicated match {
case Some(vTableReplicated) =>
val prevView: RDD[(Pid, VertexPartition[VD])] =
vTableReplicated.get(includeSrc, includeDst)
// Update vTableReplicated with shippedVerts, setting staleness flags in the resulting
// VertexPartitions
prevView.zipPartitions(shippedVerts) { (prevViewIter, shippedVertsIter) =>
val (pid, prevVPart) = prevViewIter.next()
val newVPart = prevVPart.innerJoinKeepLeft(shippedVertsIter.flatMap(_._2.iterator))
Iterator((pid, newVPart))
}.cache().setName("VTableReplicated delta %s %s".format(includeSrc, includeDst))
case None =>
// Within each edge partition, place the shipped vertex attributes into the correct
// locations specified in localVidMap
localVidMap.zipPartitions(shippedVerts) { (mapIter, shippedVertsIter) =>
val (pid, vidToIndex) = mapIter.next() val (pid, vidToIndex) = mapIter.next()
assert(!mapIter.hasNext) assert(!mapIter.hasNext)
// Populate the vertex array using the vidToIndex map // Populate the vertex array using the vidToIndex map
val vertexArray = new Array[VD](vidToIndex.capacity) val vertexArray = vdManifest.newArray(vidToIndex.capacity)
for ((_, block) <- msgsIter) { for ((_, block) <- shippedVertsIter) {
for (i <- 0 until block.vids.size) { for (i <- 0 until block.vids.size) {
val vid = block.vids(i) val vid = block.vids(i)
val attr = block.attrs(i) val attr = block.attrs(i)
val ind = vidToIndex.getPos(vid) & OpenHashSet.POSITION_MASK val ind = vidToIndex.getPos(vid)
vertexArray(ind) = attr vertexArray(ind) = attr
} }
} }
Iterator((pid, (vidToIndex, vertexArray))) val newVPart = new VertexPartition(
}.cache() vidToIndex, vertexArray, vidToIndex.getBitSet)(vdManifest)
Iterator((pid, newVPart))
}.cache().setName("VTableReplicated %s %s".format(includeSrc, includeDst))
}
}
}
object VTableReplicated {
protected def buildBuffer[VD: ClassManifest](
pid2vidIter: Iterator[Array[Array[Vid]]],
vertexPartIter: Iterator[VertexPartition[VD]]) = {
val pid2vid: Array[Array[Vid]] = pid2vidIter.next()
val vertexPart: VertexPartition[VD] = vertexPartIter.next()
Iterator.tabulate(pid2vid.size) { pid =>
val vidsCandidate = pid2vid(pid)
val size = vidsCandidate.length
val vids = new PrimitiveVector[Vid](pid2vid(pid).size)
val attrs = new PrimitiveVector[VD](pid2vid(pid).size)
var i = 0
while (i < size) {
val vid = vidsCandidate(i)
if (vertexPart.isDefined(vid)) {
vids += vid
attrs += vertexPart(vid)
}
i += 1
}
(pid, new VertexAttributeBlock(vids.trim().array, attrs.trim().array))
}
} }
protected def buildActiveBuffer(
pid2vidIter: Iterator[Array[Array[Vid]]],
activePartIter: Iterator[VertexPartition[_]])
: Iterator[(Int, Array[Vid])] = {
val pid2vid: Array[Array[Vid]] = pid2vidIter.next()
val activePart: VertexPartition[_] = activePartIter.next()
Iterator.tabulate(pid2vid.size) { pid =>
val vidsCandidate = pid2vid(pid)
val size = vidsCandidate.length
val actives = new PrimitiveVector[Vid](vidsCandidate.size)
var i = 0
while (i < size) {
val vid = vidsCandidate(i)
if (activePart.isDefined(vid)) {
actives += vid
}
i += 1
}
(pid, actives.trim().array)
}
}
}
class VertexAttributeBlock[VD: ClassManifest](val vids: Array[Vid], val attrs: Array[VD]) {
def iterator: Iterator[(Vid, VD)] = (0 until vids.size).iterator.map { i => (vids(i), attrs(i)) }
} }
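A minimal, self-contained sketch of the memoized four-way view dispatch that get(includeSrc, includeDst) above performs: each (includeSrc, includeDst) combination is built lazily once and reused on later calls. Names here are illustrative, not the GraphX API.

class ReplicatedViews[T](build: (Boolean, Boolean) => T) {
  // Each combination is built on first request and cached by the lazy val.
  private lazy val bothAttrs   = build(true, true)
  private lazy val srcAttrOnly = build(true, false)
  private lazy val dstAttrOnly = build(false, true)
  private lazy val noAttrs     = build(false, false)

  def get(includeSrc: Boolean, includeDst: Boolean): T = (includeSrc, includeDst) match {
    case (true, true)   => bothAttrs
    case (true, false)  => srcAttrOnly
    case (false, true)  => dstAttrOnly
    case (false, false) => noAttrs
  }
}

object ReplicatedViewsExample extends App {
  val views = new ReplicatedViews[String]((s, d) => { println("building"); "src=" + s + " dst=" + d })
  println(views.get(true, false))  // prints "building" then the view
  println(views.get(true, false))  // reuses the cached view, no rebuild
}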

View file

@ -2,12 +2,60 @@ package org.apache.spark.graph.impl
import org.apache.spark.util.collection.{BitSet, PrimitiveKeyOpenHashMap} import org.apache.spark.util.collection.{BitSet, PrimitiveKeyOpenHashMap}
import org.apache.spark.Logging
import org.apache.spark.graph._ import org.apache.spark.graph._
class VertexPartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) VD: ClassManifest](
private[graph] object VertexPartition {
def apply[VD: ClassManifest](iter: Iterator[(Vid, VD)]): VertexPartition[VD] = {
val map = new PrimitiveKeyOpenHashMap[Vid, VD]
iter.foreach { case (k, v) =>
map(k) = v
}
new VertexPartition(map.keySet, map._values, map.keySet.getBitSet)
}
def apply[VD: ClassManifest](iter: Iterator[(Vid, VD)], mergeFunc: (VD, VD) => VD)
: VertexPartition[VD] =
{
val map = new PrimitiveKeyOpenHashMap[Vid, VD]
iter.foreach { case (k, v) =>
map.setMerge(k, v, mergeFunc)
}
new VertexPartition(map.keySet, map._values, map.keySet.getBitSet)
}
}
private[graph]
class VertexPartition[@specialized(Long, Int, Double) VD: ClassManifest](
val index: VertexIdToIndexMap, val index: VertexIdToIndexMap,
val values: Array[VD], val values: Array[VD],
val mask: BitSet) { val mask: BitSet,
/** A set of vids of active vertices. May contain vids not in index due to join rewrite. */
private val activeSet: Option[VertexSet] = None)
extends Logging {
val capacity: Int = index.capacity
def size: Int = mask.cardinality()
/** Return the vertex attribute for the given vertex ID. */
def apply(vid: Vid): VD = values(index.getPos(vid))
def isDefined(vid: Vid): Boolean = {
val pos = index.getPos(vid)
pos >= 0 && mask.get(pos)
}
/** Look up vid in activeSet, throwing an exception if it is None. */
def isActive(vid: Vid): Boolean = {
activeSet.get.contains(vid)
}
/** The number of active vertices, if any exist. */
def numActives: Option[Int] = activeSet.map(_.size)
/** /**
* Pass each vertex attribute along with the vertex id through a map * Pass each vertex attribute along with the vertex id through a map
@ -19,48 +67,196 @@ class VertexPartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double
* attribute in the RDD * attribute in the RDD
* *
* @return a new VertexPartition with values obtained by applying `f` to * @return a new VertexPartition with values obtained by applying `f` to
* each of the entries in the original VertexSet. The resulting * each of the entries in the original VertexRDD. The resulting
* VertexPartition retains the same index. * VertexPartition retains the same index.
*/ */
def map[VD2: ClassManifest](f: (Vid, VD) => VD2): VertexPartition[VD2] = { def map[VD2: ClassManifest](f: (Vid, VD) => VD2): VertexPartition[VD2] = {
// Construct a view of the map transformation // Construct a view of the map transformation
val newValues = new Array[VD2](index.capacity) val newValues = new Array[VD2](capacity)
mask.iterator.foreach { ind => var i = mask.nextSetBit(0)
newValues(ind) = f(index.getValueSafe(ind), values(ind)) while (i >= 0) {
newValues(i) = f(index.getValue(i), values(i))
i = mask.nextSetBit(i + 1)
} }
new VertexPartition[VD2](index, newValues, mask) new VertexPartition[VD2](index, newValues, mask)
} }
/** /**
* Restrict the vertex set to the set of vertices satisfying the * Restrict the vertex set to the set of vertices satisfying the given predicate.
* given predicate.
* *
* @param pred the user defined predicate * @param pred the user defined predicate
* *
* @note The vertex set preserves the original index structure * @note The vertex set preserves the original index structure which means that the returned
* which means that the returned RDD can be easily joined with * RDD can be easily joined with the original vertex-set. Furthermore, the filter only
* the original vertex-set. Furthermore, the filter only
* modifies the bitmap index and so no new values are allocated. * modifies the bitmap index and so no new values are allocated.
*/ */
def filter(pred: (Vid, VD) => Boolean): VertexPartition[VD] = { def filter(pred: (Vid, VD) => Boolean): VertexPartition[VD] = {
// Allocate the array to store the results into // Allocate the array to store the results into
val newMask = new BitSet(index.capacity) val newMask = new BitSet(capacity)
// Iterate over the active bits in the old bitset and // Iterate over the active bits in the old mask and evaluate the predicate
// evaluate the predicate var i = mask.nextSetBit(0)
var ind = mask.nextSetBit(0) while (i >= 0) {
while (ind >= 0) { if (pred(index.getValue(i), values(i))) {
val k = index.getValueSafe(ind) newMask.set(i)
if (pred(k, values(ind))) {
newMask.set(ind)
} }
ind = mask.nextSetBit(ind + 1) i = mask.nextSetBit(i + 1)
} }
new VertexPartition(index, values, newMask) new VertexPartition(index, values, newMask)
} }
/** /**
* Construct a new VertexPartition whose index contains only the vertices in * Hides vertices that are the same between this and other. For vertices that are different, keeps
* the mask. * the values from `other`. The indices of `this` and `other` must be the same.
*/
def diff(other: VertexPartition[VD]): VertexPartition[VD] = {
if (index != other.index) {
logWarning("Diffing two VertexPartitions with different indexes is slow.")
diff(createUsingIndex(other.iterator))
} else {
val newMask = mask & other.mask
var i = newMask.nextSetBit(0)
while (i >= 0) {
if (values(i) == other.values(i)) {
newMask.unset(i)
}
i = newMask.nextSetBit(i + 1)
}
new VertexPartition(index, other.values, newMask)
}
}
/** Inner join another VertexPartition. */
def join[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexPartition[VD2])
(f: (Vid, VD, VD2) => VD3): VertexPartition[VD3] =
{
if (index != other.index) {
logWarning("Joining two VertexPartitions with different indexes is slow.")
join(createUsingIndex(other.iterator))(f)
} else {
val newValues = new Array[VD3](capacity)
val newMask = mask & other.mask
var i = newMask.nextSetBit(0)
while (i >= 0) {
newValues(i) = f(index.getValue(i), values(i), other.values(i))
i = newMask.nextSetBit(i + 1)
}
new VertexPartition(index, newValues, newMask)
}
}
/** Left outer join another VertexPartition. */
def leftJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: VertexPartition[VD2])
(f: (Vid, VD, Option[VD2]) => VD3): VertexPartition[VD3] = {
if (index != other.index) {
logWarning("Joining two VertexPartitions with different indexes is slow.")
leftJoin(createUsingIndex(other.iterator))(f)
} else {
val newValues = new Array[VD3](capacity)
var i = mask.nextSetBit(0)
while (i >= 0) {
val otherV: Option[VD2] = if (other.mask.get(i)) Some(other.values(i)) else None
newValues(i) = f(index.getValue(i), values(i), otherV)
i = mask.nextSetBit(i + 1)
}
new VertexPartition(index, newValues, mask)
}
}
/** Left outer join another iterator of messages. */
def leftJoin[VD2: ClassManifest, VD3: ClassManifest]
(other: Iterator[(Vid, VD2)])
(f: (Vid, VD, Option[VD2]) => VD3): VertexPartition[VD3] = {
leftJoin(createUsingIndex(other))(f)
}
/** Inner join another VertexPartition. */
def innerJoin[U: ClassManifest, VD2: ClassManifest](other: VertexPartition[U])
(f: (Vid, VD, U) => VD2): VertexPartition[VD2] = {
if (index != other.index) {
logWarning("Joining two VertexPartitions with different indexes is slow.")
innerJoin(createUsingIndex(other.iterator))(f)
} else {
val newMask = mask & other.mask
val newValues = new Array[VD2](capacity)
var i = newMask.nextSetBit(0)
while (i >= 0) {
newValues(i) = f(index.getValue(i), values(i), other.values(i))
i = newMask.nextSetBit(i + 1)
}
new VertexPartition(index, newValues, newMask)
}
}
/**
* Inner join an iterator of messages.
*/
def innerJoin[U: ClassManifest, VD2: ClassManifest]
(iter: Iterator[Product2[Vid, U]])
(f: (Vid, VD, U) => VD2): VertexPartition[VD2] = {
innerJoin(createUsingIndex(iter))(f)
}
/**
* Has the same effect as `aggregateUsingIndex((a, b) => a)`.
*/
def createUsingIndex[VD2: ClassManifest](iter: Iterator[Product2[Vid, VD2]])
: VertexPartition[VD2] = {
val newMask = new BitSet(capacity)
val newValues = new Array[VD2](capacity)
iter.foreach { case (vid, vdata) =>
val pos = index.getPos(vid)
newMask.set(pos)
newValues(pos) = vdata
}
new VertexPartition[VD2](index, newValues, newMask)
}
/**
* Similar to innerJoin, but vertices from the left side that don't appear in iter will remain in
* the partition, hidden by the bitmask.
*/
def innerJoinKeepLeft(iter: Iterator[Product2[Vid, VD]]): VertexPartition[VD] = {
val newMask = new BitSet(capacity)
val newValues = new Array[VD](capacity)
System.arraycopy(values, 0, newValues, 0, newValues.length)
iter.foreach { case (vid, vdata) =>
val pos = index.getPos(vid)
newMask.set(pos)
newValues(pos) = vdata
}
new VertexPartition(index, newValues, newMask)
}
def aggregateUsingIndex[VD2: ClassManifest](
iter: Iterator[Product2[Vid, VD2]], reduceFunc: (VD2, VD2) => VD2): VertexPartition[VD2] =
{
val newMask = new BitSet(capacity)
val newValues = new Array[VD2](capacity)
iter.foreach { product =>
val vid = product._1
val vdata = product._2
val pos = index.getPos(vid)
if (newMask.get(pos)) {
newValues(pos) = reduceFunc(newValues(pos), vdata)
} else { // otherwise just store the new value
newMask.set(pos)
newValues(pos) = vdata
}
}
new VertexPartition[VD2](index, newValues, newMask)
}
def replaceActives(iter: Iterator[Vid]): VertexPartition[VD] = {
val newActiveSet = new VertexSet
iter.foreach(newActiveSet.add(_))
new VertexPartition(index, values, mask, Some(newActiveSet))
}
/**
* Construct a new VertexPartition whose index contains only the vertices in the mask.
*/ */
def reindex(): VertexPartition[VD] = { def reindex(): VertexPartition[VD] = {
val hashMap = new PrimitiveKeyOpenHashMap[Vid, VD] val hashMap = new PrimitiveKeyOpenHashMap[Vid, VD]
@ -68,8 +264,10 @@ class VertexPartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double
for ((k, v) <- this.iterator) { for ((k, v) <- this.iterator) {
hashMap.setMerge(k, v, arbitraryMerge) hashMap.setMerge(k, v, arbitraryMerge)
} }
new VertexPartition(hashMap.keySet, hashMap._values, index.getBitSet) new VertexPartition(hashMap.keySet, hashMap._values, hashMap.keySet.getBitSet)
} }
def iterator = mask.iterator.map(ind => (index.getValueSafe(ind), values(ind))) def iterator: Iterator[(Vid, VD)] = mask.iterator.map(ind => (index.getValue(ind), values(ind)))
def vidIterator: Iterator[Vid] = mask.iterator.map(ind => index.getValue(ind))
} }
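The joins above all follow one pattern: when two partitions share an index, joining is just intersecting their definedness masks and combining values at the surviving bit positions. A simplified, Spark-free sketch of that idea with a plain mutable.BitSet; names and values are illustrative only.

import scala.collection.mutable.BitSet

object MaskJoinSketch extends App {
  val capacity = 8
  val leftValues  = Array.tabulate(capacity)(i => i * 1.0)
  val leftMask    = BitSet(0, 2, 4, 6)   // slots defined on the left
  val rightValues = Array.tabulate(capacity)(i => i * 10.0)
  val rightMask   = BitSet(2, 3, 4, 5)   // slots defined on the right

  val newMask   = leftMask & rightMask   // only slots defined on both sides survive
  val newValues = new Array[Double](capacity)
  for (i <- newMask) newValues(i) = leftValues(i) + rightValues(i)

  println(newMask.toList.map(i => i -> newValues(i)))  // List((2,22.0), (4,44.0))
}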

View file

@ -1,21 +1,18 @@
package org.apache.spark.graph.impl package org.apache.spark.graph.impl
import scala.collection.JavaConversions._ import org.apache.spark.SparkContext._
import scala.collection.mutable.ArrayBuffer import org.apache.spark.graph._
import scala.collection.mutable.ArrayBuilder
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.collection.PrimitiveVector
import org.apache.spark.graph._
/** /**
* Stores the layout of replicated vertex attributes for GraphImpl. Tells each * Stores the locations of edge-partition join sites for each vertex attribute in `vTable`; that is,
* partition of the vertex data where it should go. * the routing information for shipping vertex attributes to edge partitions. This is always cached
* because it may be used multiple times in VTableReplicated -- once to ship the vertex attributes
* and (possibly) once to ship the active-set information.
*/ */
class VertexPlacement( class VertexPlacement(eTable: EdgeRDD[_], vTable: VertexRDD[_]) {
eTable: RDD[(Pid, EdgePartition[ED])] forSome { type ED },
vTable: VertexSetRDD[_]) {
val bothAttrs: RDD[Array[Array[Vid]]] = createPid2Vid(true, true) val bothAttrs: RDD[Array[Array[Vid]]] = createPid2Vid(true, true)
val srcAttrOnly: RDD[Array[Array[Vid]]] = createPid2Vid(true, false) val srcAttrOnly: RDD[Array[Array[Vid]]] = createPid2Vid(true, false)
@ -30,43 +27,38 @@ class VertexPlacement(
case (false, false) => noAttrs case (false, false) => noAttrs
} }
def persist(newLevel: StorageLevel) {
bothAttrs.persist(newLevel)
srcAttrOnly.persist(newLevel)
dstAttrOnly.persist(newLevel)
noAttrs.persist(newLevel)
}
private def createPid2Vid( private def createPid2Vid(
includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[Vid]]] = { includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[Vid]]] = {
// Determine which vertices each edge partition needs by creating a mapping // Determine which vertices each edge partition needs by creating a mapping from vid to pid.
// from vid to pid val vid2pid: RDD[(Vid, Pid)] = eTable.partitionsRDD.mapPartitions { iter =>
val preAgg = eTable.mapPartitions { iter => val (pid: Pid, edgePartition: EdgePartition[_]) = iter.next()
val (pid, edgePartition) = iter.next() val numEdges = edgePartition.size
val vSet = new VertexSet val vSet = new VertexSet
if (includeSrcAttr || includeDstAttr) { if (includeSrcAttr) { // Add src vertices to the set.
edgePartition.foreach { e => var i = 0
if (includeSrcAttr) vSet.add(e.srcId) while (i < numEdges) {
if (includeDstAttr) vSet.add(e.dstId) vSet.add(edgePartition.srcIds(i))
i += 1
} }
} }
vSet.iterator.map { vid => (vid.toLong, pid) } if (includeDstAttr) { // Add dst vertices to the set.
var i = 0
while (i < numEdges) {
vSet.add(edgePartition.dstIds(i))
i += 1
} }
// Aggregate the mappings to determine where each vertex should go
val vid2pid = VertexSetRDD[Pid, ArrayBuffer[Pid]](preAgg, vTable.index,
(p: Pid) => ArrayBuffer(p),
(ab: ArrayBuffer[Pid], p:Pid) => {ab.append(p); ab},
(a: ArrayBuffer[Pid], b: ArrayBuffer[Pid]) => a ++ b)
.mapValues(a => a.toArray)
// Within each vertex partition, reorganize the placement information into
// columnar format keyed on the destination partition
val numPartitions = vid2pid.partitions.size
vid2pid.mapPartitions { iter =>
val pid2vid = Array.fill[ArrayBuilder[Vid]](numPartitions)(ArrayBuilder.make[Vid])
for ((vid, pids) <- iter) {
pids.foreach { pid => pid2vid(pid) += vid }
} }
Iterator(pid2vid.map(_.result)) vSet.iterator.map { vid => (vid, pid) }
} }
val numPartitions = vTable.partitions.size
vid2pid.partitionBy(vTable.partitioner.get).mapPartitions { iter =>
val pid2vid = Array.fill(numPartitions)(new PrimitiveVector[Vid])
for ((vid, pid) <- iter) {
pid2vid(pid) += vid
}
Iterator(pid2vid.map(_.trim().array))
}.cache().setName("VertexPlacement %s %s".format(includeSrcAttr, includeDstAttr))
} }
} }
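createPid2Vid produces, for each vertex partition, one array of vertex ids per destination edge partition -- the "columnar" routing tables mentioned above. A small local sketch of that regrouping step, with ArrayBuffer standing in for PrimitiveVector and made-up ids:

import scala.collection.mutable.ArrayBuffer

object Pid2VidSketch extends App {
  val numEdgePartitions = 3
  // (vertex id, destination edge partition) pairs for one vertex partition
  val vid2pid: Seq[(Long, Int)] = Seq(1L -> 0, 2L -> 2, 3L -> 0, 4L -> 1, 5L -> 2)

  // Regroup into one array of vertex ids per edge partition.
  val pid2vid = Array.fill(numEdgePartitions)(new ArrayBuffer[Long])
  for ((vid, pid) <- vid2pid) pid2vid(pid) += vid

  pid2vid.zipWithIndex.foreach { case (vids, pid) =>
    println("edge partition " + pid + " receives vertices " + vids.mkString(", "))
  }
}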

View file

@ -6,10 +6,11 @@ import org.apache.spark.util.collection.OpenHashSet
package object graph { package object graph {
type Vid = Long type Vid = Long
// TODO: Consider using Char.
type Pid = Int type Pid = Int
type VertexSet = OpenHashSet[Vid] type VertexSet = OpenHashSet[Vid]
type VertexArrayList = it.unimi.dsi.fastutil.longs.LongArrayList
// type VertexIdToIndexMap = it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap // type VertexIdToIndexMap = it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap
type VertexIdToIndexMap = OpenHashSet[Vid] type VertexIdToIndexMap = OpenHashSet[Vid]
@ -18,11 +19,4 @@ package object graph {
* Return the default null-like value for a data type T. * Return the default null-like value for a data type T.
*/ */
def nullValue[T] = null.asInstanceOf[T] def nullValue[T] = null.asInstanceOf[T]
private[graph]
case class MutableTuple2[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) U,
@specialized(Char, Int, Boolean, Byte, Long, Float, Double) V](
var _1: U, var _2: V)
} }

View file

@ -268,14 +268,14 @@ object GraphGenerators {
* Create a star graph with vertex 0 being the center. * Create a star graph with vertex 0 being the center.
* *
* @param sc the spark context in which to construct the graph * @param sc the spark context in which to construct the graph
* @param the number of vertices in the star * @param nverts the number of vertices in the star
* *
* @return A star graph containing `nverts` vertices with vertex 0 * @return A star graph containing `nverts` vertices with vertex 0
* being the center vertex. * being the center vertex.
*/ */
def starGraph(sc: SparkContext, nverts: Int): Graph[Int, Int] = { def starGraph(sc: SparkContext, nverts: Int): Graph[Int, Int] = {
val edges: RDD[(Vid, Vid)] = sc.parallelize(1 until nverts).map(vid => (vid, 0)) val edges: RDD[(Vid, Vid)] = sc.parallelize(1 until nverts).map(vid => (vid, 0))
Graph(edges, 1) Graph.fromEdgeTuples(edges, 1)
} // end of starGraph } // end of starGraph
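A quick local sketch of the edge tuples starGraph feeds to Graph.fromEdgeTuples: every vertex 1..nverts-1 points at the center vertex 0 (no SparkContext needed to see the shape; object and value names here are illustrative).

object StarEdgesSketch extends App {
  val nverts = 5
  val edges: Seq[(Long, Long)] = (1 until nverts).map(vid => (vid.toLong, 0L))
  println(edges)  // Vector((1,0), (2,0), (3,0), (4,0))
}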

View file

@ -0,0 +1,28 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Set everything to be logged to the file core/target/unit-tests.log
log4j.rootCategory=INFO, file
log4j.appender.file=org.apache.log4j.FileAppender
log4j.appender.file.append=false
log4j.appender.file.file=graph/target/unit-tests.log
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
# Ignore messages below warning level from Jetty, because it's a bit verbose
log4j.logger.org.eclipse.jetty=WARN
org.eclipse.jetty.LEVEL=WARN

View file

@ -4,6 +4,7 @@ import org.scalatest.FunSuite
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._ import org.apache.spark.SparkContext._
import org.apache.spark.graph.algorithms._
import org.apache.spark.rdd._ import org.apache.spark.rdd._
import org.apache.spark.graph.LocalSparkContext._ import org.apache.spark.graph.LocalSparkContext._
@ -50,35 +51,38 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer") System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator") System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
def compareRanks(a: VertexRDD[Double], b: VertexRDD[Double]): Double = {
a.leftJoin(b) { case (id, a, bOpt) => (a - bOpt.getOrElse(0.0)) * (a - bOpt.getOrElse(0.0)) }
.map { case (id, error) => error }.sum
}
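compareRanks is the sum of squared differences between two rank assignments, with ranks missing on the right treated as 0.0. The same computation over plain Maps, purely for reference (not part of the suite):

object CompareRanksSketch extends App {
  def compareRanks(a: Map[Long, Double], b: Map[Long, Double]): Double =
    a.map { case (id, ra) => val d = ra - b.getOrElse(id, 0.0); d * d }.sum

  val static  = Map(0L -> 0.30, 1L -> 0.15, 2L -> 0.15)
  val dynamic = Map(0L -> 0.30, 1L -> 0.15)
  println(compareRanks(static, dynamic))  // ~0.0225, from the missing vertex 2
}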
test("Star PageRank") { test("Star PageRank") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val nVertices = 100 val nVertices = 100
val starGraph = GraphGenerators.starGraph(sc, nVertices) val starGraph = GraphGenerators.starGraph(sc, nVertices).cache()
val resetProb = 0.15 val resetProb = 0.15
val prGraph1 = Analytics.pagerank(starGraph, 1, resetProb) val errorTol = 1.0e-5
val prGraph2 = Analytics.pagerank(starGraph, 2, resetProb)
val notMatching = prGraph1.vertices.zipJoin(prGraph2.vertices) { (vid, pr1, pr2) => val staticRanks1 = PageRank.run(starGraph, numIter = 1, resetProb).vertices.cache()
if (pr1 != pr2) { 1 } else { 0 } val staticRanks2 = PageRank.run(starGraph, numIter = 2, resetProb).vertices.cache()
// Static PageRank should only take 2 iterations to converge
val notMatching = staticRanks1.zipJoin(staticRanks2) { (vid, pr1, pr2) =>
if (pr1 != pr2) 1 else 0
}.map { case (vid, test) => test }.sum }.map { case (vid, test) => test }.sum
assert(notMatching === 0) assert(notMatching === 0)
prGraph2.vertices.foreach(println(_))
val errors = prGraph2.vertices.map { case (vid, pr) => val staticErrors = staticRanks2.map { case (vid, pr) =>
val correct = (vid > 0 && pr == resetProb) || val correct = (vid > 0 && pr == resetProb) ||
(vid == 0 && math.abs(pr - (resetProb + (1.0 - resetProb) * (resetProb * (nVertices - 1)) )) < 1.0E-5) (vid == 0 && math.abs(pr - (resetProb + (1.0 - resetProb) * (resetProb * (nVertices - 1)) )) < 1.0E-5)
if ( !correct ) { 1 } else { 0 } if (!correct) 1 else 0
} }
assert(errors.sum === 0) assert(staticErrors.sum === 0)
val prGraph3 = Analytics.deltaPagerank(starGraph, 0, resetProb) val dynamicRanks = PageRank.runUntillConvergence(starGraph, 0, resetProb).vertices.cache()
val errors2 = prGraph2.vertices.leftJoin(prGraph3.vertices){ (vid, pr1, pr2Opt) => val standaloneRanks = PageRank.runStandalone(starGraph, 0, resetProb).cache()
pr2Opt match { assert(compareRanks(staticRanks2, dynamicRanks) < errorTol)
case Some(pr2) if(pr1 == pr2) => 0 assert(compareRanks(staticRanks2, standaloneRanks) < errorTol)
case _ => 1
}
}.map { case (vid, test) => test }.sum
assert(errors2 === 0)
} }
} // end of test Star PageRank } // end of test Star PageRank
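The expected center rank asserted above is just resetProb plus the damped contributions of the nVertices - 1 leaves, each holding resetProb. A quick numeric check of that formula (a standalone sketch, mirroring the assertion rather than re-deriving PageRank):

object StarPageRankCheck extends App {
  val nVertices = 100
  val resetProb = 0.15
  // resetProb + (1 - resetProb) * resetProb * (nVertices - 1), the value the assertion compares against
  val expectedCenterRank = resetProb + (1.0 - resetProb) * (resetProb * (nVertices - 1))
  println(expectedCenterRank)  // approximately 12.77
}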
@ -86,31 +90,50 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
test("Grid PageRank") { test("Grid PageRank") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val gridGraph = GraphGenerators.gridGraph(sc, 10, 10) val rows = 10
val cols = 10
val resetProb = 0.15 val resetProb = 0.15
val prGraph1 = Analytics.pagerank(gridGraph, 50, resetProb).cache() val tol = 0.0001
val prGraph2 = Analytics.deltaPagerank(gridGraph, 0.0001, resetProb).cache() val numIter = 50
val error = prGraph1.vertices.zipJoin(prGraph2.vertices) { case (id, a, b) => (a - b) * (a - b) } val errorTol = 1.0e-5
.map { case (id, error) => error }.sum val gridGraph = GraphGenerators.gridGraph(sc, rows, cols).cache()
prGraph1.vertices.zipJoin(prGraph2.vertices) { (id, a, b) => (a, b, a-b) }.foreach(println(_))
println(error) val staticRanks = PageRank.run(gridGraph, numIter, resetProb).vertices.cache()
assert(error < 1.0e-5) val dynamicRanks = PageRank.runUntillConvergence(gridGraph, tol, resetProb).vertices.cache()
val pr3: RDD[(Vid, Double)] = sc.parallelize(GridPageRank(10,10, 50, resetProb)) val standaloneRanks = PageRank.runStandalone(gridGraph, tol, resetProb).cache()
val error2 = prGraph1.vertices.leftJoin(pr3) { (id, a, bOpt) => val referenceRanks = VertexRDD(sc.parallelize(GridPageRank(rows, cols, numIter, resetProb)))
val b: Double = bOpt.get
(a - b) * (a - b) assert(compareRanks(staticRanks, referenceRanks) < errorTol)
}.map { case (id, error) => error }.sum assert(compareRanks(dynamicRanks, referenceRanks) < errorTol)
prGraph1.vertices.leftJoin(pr3) { (id, a, b) => (a, b) }.foreach( println(_) ) assert(compareRanks(standaloneRanks, referenceRanks) < errorTol)
println(error2)
assert(error2 < 1.0e-5)
} }
} // end of Grid PageRank } // end of Grid PageRank
test("Chain PageRank") {
withSpark(new SparkContext("local", "test")) { sc =>
val chain1 = (0 until 9).map(x => (x, x+1) )
val rawEdges = sc.parallelize(chain1, 1).map { case (s,d) => (s.toLong, d.toLong) }
val chain = Graph.fromEdgeTuples(rawEdges, 1.0).cache()
val resetProb = 0.15
val tol = 0.0001
val numIter = 10
val errorTol = 1.0e-5
val staticRanks = PageRank.run(chain, numIter, resetProb).vertices.cache()
val dynamicRanks = PageRank.runUntillConvergence(chain, tol, resetProb).vertices.cache()
val standaloneRanks = PageRank.runStandalone(chain, tol, resetProb).cache()
assert(compareRanks(staticRanks, dynamicRanks) < errorTol)
assert(compareRanks(dynamicRanks, standaloneRanks) < errorTol)
}
}
test("Grid Connected Components") { test("Grid Connected Components") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val gridGraph = GraphGenerators.gridGraph(sc, 10, 10) val gridGraph = GraphGenerators.gridGraph(sc, 10, 10).cache()
val ccGraph = Analytics.connectedComponents(gridGraph).cache() val ccGraph = ConnectedComponents.run(gridGraph).cache()
val maxCCid = ccGraph.vertices.map { case (vid, ccId) => ccId }.sum val maxCCid = ccGraph.vertices.map { case (vid, ccId) => ccId }.sum
assert(maxCCid === 0) assert(maxCCid === 0)
} }
@ -119,8 +142,8 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
test("Reverse Grid Connected Components") { test("Reverse Grid Connected Components") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val gridGraph = GraphGenerators.gridGraph(sc, 10, 10).reverse val gridGraph = GraphGenerators.gridGraph(sc, 10, 10).reverse.cache()
val ccGraph = Analytics.connectedComponents(gridGraph).cache() val ccGraph = ConnectedComponents.run(gridGraph).cache()
val maxCCid = ccGraph.vertices.map { case (vid, ccId) => ccId }.sum val maxCCid = ccGraph.vertices.map { case (vid, ccId) => ccId }.sum
assert(maxCCid === 0) assert(maxCCid === 0)
} }
@ -132,15 +155,14 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
val chain1 = (0 until 9).map(x => (x, x+1) ) val chain1 = (0 until 9).map(x => (x, x+1) )
val chain2 = (10 until 20).map(x => (x, x+1) ) val chain2 = (10 until 20).map(x => (x, x+1) )
val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) } val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) }
val twoChains = Graph(rawEdges, 1.0) val twoChains = Graph.fromEdgeTuples(rawEdges, 1.0).cache()
val ccGraph = Analytics.connectedComponents(twoChains).cache() val ccGraph = ConnectedComponents.run(twoChains).cache()
val vertices = ccGraph.vertices.collect val vertices = ccGraph.vertices.collect()
for ( (id, cc) <- vertices ) { for ( (id, cc) <- vertices ) {
if(id < 10) { assert(cc === 0) } if(id < 10) { assert(cc === 0) }
else { assert(cc === 10) } else { assert(cc === 10) }
} }
val ccMap = vertices.toMap val ccMap = vertices.toMap
println(ccMap)
for (id <- 0 until 20) { for (id <- 0 until 20) {
if (id < 10) { if (id < 10) {
assert(ccMap(id) === 0) assert(ccMap(id) === 0)
@ -156,8 +178,8 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
val chain1 = (0 until 9).map(x => (x, x+1) ) val chain1 = (0 until 9).map(x => (x, x+1) )
val chain2 = (10 until 20).map(x => (x, x+1) ) val chain2 = (10 until 20).map(x => (x, x+1) )
val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) } val rawEdges = sc.parallelize(chain1 ++ chain2, 3).map { case (s,d) => (s.toLong, d.toLong) }
val twoChains = Graph(rawEdges, true).reverse val twoChains = Graph.fromEdgeTuples(rawEdges, true).reverse.cache()
val ccGraph = Analytics.connectedComponents(twoChains).cache() val ccGraph = ConnectedComponents.run(twoChains).cache()
val vertices = ccGraph.vertices.collect val vertices = ccGraph.vertices.collect
for ( (id, cc) <- vertices ) { for ( (id, cc) <- vertices ) {
if (id < 10) { if (id < 10) {
@ -167,7 +189,6 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
} }
} }
val ccMap = vertices.toMap val ccMap = vertices.toMap
println(ccMap)
for ( id <- 0 until 20 ) { for ( id <- 0 until 20 ) {
if (id < 10) { if (id < 10) {
assert(ccMap(id) === 0) assert(ccMap(id) === 0)
@ -181,8 +202,8 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
test("Count a single triangle") { test("Count a single triangle") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val rawEdges = sc.parallelize(Array( 0L->1L, 1L->2L, 2L->0L ), 2) val rawEdges = sc.parallelize(Array( 0L->1L, 1L->2L, 2L->0L ), 2)
val graph = Graph(rawEdges, true).cache val graph = Graph.fromEdgeTuples(rawEdges, true).cache()
val triangleCount = Analytics.triangleCount(graph) val triangleCount = TriangleCount.run(graph)
val verts = triangleCount.vertices val verts = triangleCount.vertices
verts.collect.foreach { case (vid, count) => assert(count === 1) } verts.collect.foreach { case (vid, count) => assert(count === 1) }
} }
@ -193,10 +214,10 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
val triangles = Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++ val triangles = Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
Array(0L -> -1L, -1L -> -2L, -2L -> 0L) Array(0L -> -1L, -1L -> -2L, -2L -> 0L)
val rawEdges = sc.parallelize(triangles, 2) val rawEdges = sc.parallelize(triangles, 2)
val graph = Graph(rawEdges, true).cache val graph = Graph.fromEdgeTuples(rawEdges, true).cache()
val triangleCount = Analytics.triangleCount(graph) val triangleCount = TriangleCount.run(graph)
val verts = triangleCount.vertices val verts = triangleCount.vertices
verts.collect.foreach { case (vid, count) => verts.collect().foreach { case (vid, count) =>
if (vid == 0) { if (vid == 0) {
assert(count === 2) assert(count === 2)
} else { } else {
@ -213,10 +234,10 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
Array(0L -> -1L, -1L -> -2L, -2L -> 0L) Array(0L -> -1L, -1L -> -2L, -2L -> 0L)
val revTriangles = triangles.map { case (a,b) => (b,a) } val revTriangles = triangles.map { case (a,b) => (b,a) }
val rawEdges = sc.parallelize(triangles ++ revTriangles, 2) val rawEdges = sc.parallelize(triangles ++ revTriangles, 2)
val graph = Graph(rawEdges, true).cache val graph = Graph.fromEdgeTuples(rawEdges, true).cache()
val triangleCount = Analytics.triangleCount(graph) val triangleCount = TriangleCount.run(graph)
val verts = triangleCount.vertices val verts = triangleCount.vertices
verts.collect.foreach { case (vid, count) => verts.collect().foreach { case (vid, count) =>
if (vid == 0) { if (vid == 0) {
assert(count === 4) assert(count === 4)
} else { } else {
@ -230,10 +251,25 @@ class AnalyticsSuite extends FunSuite with LocalSparkContext {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val rawEdges = sc.parallelize(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++ val rawEdges = sc.parallelize(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
Array(0L -> 1L, 1L -> 2L, 2L -> 0L), 2) Array(0L -> 1L, 1L -> 2L, 2L -> 0L), 2)
val graph = Graph(rawEdges, true).cache val graph = Graph.fromEdgeTuples(rawEdges, true, uniqueEdges = Some(RandomVertexCut)).cache()
val triangleCount = Analytics.triangleCount(graph) val triangleCount = TriangleCount.run(graph)
val verts = triangleCount.vertices val verts = triangleCount.vertices
verts.collect.foreach { case (vid, count) => assert(count === 1) } verts.collect.foreach { case (vid, count) => assert(count === 1) }
} }
} }
test("Test SVD++ with mean square error on training set") {
withSpark(new SparkContext("local", "test")) { sc =>
val SvdppErr = 0.01
val edges = sc.textFile("mllib/data/als/test.data").map { line =>
val fields = line.split(",")
Edge(fields(0).toLong * 2, fields(1).toLong * 2 + 1, fields(2).toDouble)
}
val graph = Svdpp.run(edges)
val err = graph.vertices.collect.map{ case (vid, vd) =>
if (vid % 2 == 1) { vd.norm } else { 0.0 }
}.reduce(_ + _) / graph.triplets.collect.size
assert(err < SvdppErr)
}
}
} // end of AnalyticsSuite } // end of AnalyticsSuite

View file

@ -1,9 +1,12 @@
package org.apache.spark.graph package org.apache.spark.graph
import scala.util.Random
import org.scalatest.FunSuite import org.scalatest.FunSuite
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.graph.LocalSparkContext._ import org.apache.spark.graph.LocalSparkContext._
import org.apache.spark.graph.impl.EdgePartitionBuilder
import org.apache.spark.rdd._ import org.apache.spark.rdd._
class GraphSuite extends FunSuite with LocalSparkContext { class GraphSuite extends FunSuite with LocalSparkContext {
@ -15,7 +18,7 @@ class GraphSuite extends FunSuite with LocalSparkContext {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val rawEdges = (0L to 100L).zip((1L to 99L) :+ 0L) val rawEdges = (0L to 100L).zip((1L to 99L) :+ 0L)
val edges = sc.parallelize(rawEdges) val edges = sc.parallelize(rawEdges)
val graph = Graph(edges, 1.0F) val graph = Graph.fromEdgeTuples(edges, 1.0F)
assert(graph.edges.count() === rawEdges.size) assert(graph.edges.count() === rawEdges.size)
} }
} }
@ -35,10 +38,45 @@ class GraphSuite extends FunSuite with LocalSparkContext {
} }
} }
test("core operations") {
withSpark(new SparkContext("local", "test")) { sc =>
val n = 5
val star = Graph.fromEdgeTuples(
sc.parallelize((1 to n).map(x => (0: Vid, x: Vid)), 3), "v")
// triplets
assert(star.triplets.map(et => (et.srcId, et.dstId, et.srcAttr, et.dstAttr)).collect.toSet ===
(1 to n).map(x => (0: Vid, x: Vid, "v", "v")).toSet)
// reverse
val reverseStar = star.reverse
assert(reverseStar.outDegrees.collect.toSet === (1 to n).map(x => (x: Vid, 1)).toSet)
// outerJoinVertices
val reverseStarDegrees =
reverseStar.outerJoinVertices(reverseStar.outDegrees) { (vid, a, bOpt) => bOpt.getOrElse(0) }
val neighborDegreeSums = reverseStarDegrees.mapReduceTriplets(
et => Iterator((et.srcId, et.dstAttr), (et.dstId, et.srcAttr)),
(a: Int, b: Int) => a + b).collect.toSet
assert(neighborDegreeSums === Set((0: Vid, n)) ++ (1 to n).map(x => (x: Vid, 0)))
// mapVertices preserving type
val mappedVAttrs = reverseStar.mapVertices((vid, attr) => attr + "2")
assert(mappedVAttrs.vertices.collect.toSet === (0 to n).map(x => (x: Vid, "v2")).toSet)
// mapVertices changing type
val mappedVAttrs2 = reverseStar.mapVertices((vid, attr) => attr.length)
assert(mappedVAttrs2.vertices.collect.toSet === (0 to n).map(x => (x: Vid, 1)).toSet)
// groupEdges
val doubleStar = Graph.fromEdgeTuples(
sc.parallelize((1 to n).flatMap(x => List((0: Vid, x: Vid), (0: Vid, x: Vid))), 1), "v")
val star2 = doubleStar.groupEdges { (a, b) => a}
assert(star2.edges.collect.toArray.sorted(Edge.lexicographicOrdering[Int]) ===
star.edges.collect.toArray.sorted(Edge.lexicographicOrdering[Int]))
assert(star2.vertices.collect.toSet === star.vertices.collect.toSet)
}
}
test("mapEdges") { test("mapEdges") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val n = 3 val n = 3
val star = Graph(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), "defaultValue") val star = Graph.fromEdgeTuples(
sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), "v")
val starWithEdgeAttrs = star.mapEdges(e => e.dstId) val starWithEdgeAttrs = star.mapEdges(e => e.dstId)
// map(_.copy()) is a workaround for https://github.com/amplab/graphx/issues/25 // map(_.copy()) is a workaround for https://github.com/amplab/graphx/issues/25
@ -50,20 +88,49 @@ class GraphSuite extends FunSuite with LocalSparkContext {
test("mapReduceTriplets") { test("mapReduceTriplets") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val n = 3 val n = 5
val star = Graph(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), 0) val star = Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), 0)
val starDeg = star.joinVertices(star.degrees){ (vid, oldV, deg) => deg } val starDeg = star.joinVertices(star.degrees){ (vid, oldV, deg) => deg }
val neighborDegreeSums = starDeg.mapReduceTriplets( val neighborDegreeSums = starDeg.mapReduceTriplets(
edge => Iterator((edge.srcId, edge.dstAttr), (edge.dstId, edge.srcAttr)), edge => Iterator((edge.srcId, edge.dstAttr), (edge.dstId, edge.srcAttr)),
(a: Int, b: Int) => a + b) (a: Int, b: Int) => a + b)
assert(neighborDegreeSums.collect().toSet === (0 to n).map(x => (x, n)).toSet) assert(neighborDegreeSums.collect().toSet === (0 to n).map(x => (x, n)).toSet)
// activeSetOpt
val allPairs = for (x <- 1 to n; y <- 1 to n) yield (x: Vid, y: Vid)
val complete = Graph.fromEdgeTuples(sc.parallelize(allPairs, 3), 0)
val vids = complete.mapVertices((vid, attr) => vid).cache()
val active = vids.vertices.filter { case (vid, attr) => attr % 2 == 0 }
val numEvenNeighbors = vids.mapReduceTriplets(et => {
// Map function should only run on edges with destination in the active set
if (et.dstId % 2 != 0) {
throw new Exception("map ran on edge with dst vid %d, which is odd".format(et.dstId))
}
Iterator((et.srcId, 1))
}, (a: Int, b: Int) => a + b, Some((active, EdgeDirection.In))).collect.toSet
assert(numEvenNeighbors === (1 to n).map(x => (x: Vid, n / 2)).toSet)
// outerJoinVertices followed by mapReduceTriplets(activeSetOpt)
val ring = Graph.fromEdgeTuples(sc.parallelize((0 until n).map(x => (x: Vid, (x+1) % n: Vid)), 3), 0)
.mapVertices((vid, attr) => vid).cache()
val changed = ring.vertices.filter { case (vid, attr) => attr % 2 == 1 }.mapValues(-_)
val changedGraph = ring.outerJoinVertices(changed) { (vid, old, newOpt) => newOpt.getOrElse(old) }
val numOddNeighbors = changedGraph.mapReduceTriplets(et => {
// Map function should only run on edges with source in the active set
if (et.srcId % 2 != 1) {
throw new Exception("map ran on edge with src vid %d, which is even".format(et.dstId))
}
Iterator((et.dstId, 1))
}, (a: Int, b: Int) => a + b, Some(changed, EdgeDirection.Out)).collect.toSet
assert(numOddNeighbors === (2 to n by 2).map(x => (x: Vid, 1)).toSet)
} }
} }
test("aggregateNeighbors") { test("aggregateNeighbors") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val n = 3 val n = 3
val star = Graph(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), 1) val star = Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), 1)
val indegrees = star.aggregateNeighbors( val indegrees = star.aggregateNeighbors(
(vid, edge) => Some(1), (vid, edge) => Some(1),
@ -103,7 +170,7 @@ class GraphSuite extends FunSuite with LocalSparkContext {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val chain = (0 until 100).map(x => (x, (x+1)%100) ) val chain = (0 until 100).map(x => (x, (x+1)%100) )
val rawEdges = sc.parallelize(chain, 3).map { case (s,d) => (s.toLong, d.toLong) } val rawEdges = sc.parallelize(chain, 3).map { case (s,d) => (s.toLong, d.toLong) }
val graph = Graph(rawEdges, 1.0) val graph = Graph.fromEdgeTuples(rawEdges, 1.0)
val nbrs = graph.collectNeighborIds(EdgeDirection.Both) val nbrs = graph.collectNeighborIds(EdgeDirection.Both)
assert(nbrs.count === chain.size) assert(nbrs.count === chain.size)
assert(graph.numVertices === nbrs.count) assert(graph.numVertices === nbrs.count)
@ -165,18 +232,50 @@ class GraphSuite extends FunSuite with LocalSparkContext {
test("VertexSetRDD") { test("VertexSetRDD") {
withSpark(new SparkContext("local", "test")) { sc => withSpark(new SparkContext("local", "test")) { sc =>
val a = sc.parallelize((0 to 100).map(x => (x.toLong, x.toLong)), 5) val n = 100
val b = VertexSetRDD(a).mapValues(x => -x) val a = sc.parallelize((0 to n).map(x => (x.toLong, x.toLong)), 5)
assert(b.count === 101) val b = VertexRDD(a).mapValues(x => -x).cache() // Allow joining b with a derived RDD of b
assert(b.count === n + 1)
assert(b.leftJoin(a){ (id, a, bOpt) => a + bOpt.get }.map(x=> x._2).reduce(_+_) === 0) assert(b.leftJoin(a){ (id, a, bOpt) => a + bOpt.get }.map(x=> x._2).reduce(_+_) === 0)
val c = VertexSetRDD(a, b.index) val c = b.aggregateUsingIndex[Long](a, (x, y) => x)
assert(b.leftJoin(c){ (id, b, cOpt) => b + cOpt.get }.map(x=> x._2).reduce(_+_) === 0) assert(b.leftJoin(c){ (id, b, cOpt) => b + cOpt.get }.map(x=> x._2).reduce(_+_) === 0)
val d = c.filter(q => ((q._2 % 2) == 0)) val d = c.filter(q => ((q._2 % 2) == 0))
val e = a.filter(q => ((q._2 % 2) == 0)) val e = a.filter(q => ((q._2 % 2) == 0))
assert(d.count === e.count) assert(d.count === e.count)
assert(b.zipJoin(c)((id, b, c) => b + c).map(x => x._2).reduce(_+_) === 0) assert(b.zipJoin(c)((id, b, c) => b + c).map(x => x._2).reduce(_+_) === 0)
val f = b.mapValues(x => if (x % 2 == 0) -x else x)
assert(b.diff(f).collect().toSet === (2 to n by 2).map(x => (x.toLong, x.toLong)).toSet)
} }
} }
test("subgraph") {
withSpark(new SparkContext("local", "test")) { sc =>
// Create a star graph of 10 vertices.
val n = 10
val star = Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid))), "v")
// Take only vertices whose vids are even
val subgraph = star.subgraph(vpred = (vid, attr) => vid % 2 == 0)
// We should have n/2 + 1 = 6 vertices: the even vids 0, 2, ..., 10.
assert(subgraph.vertices.collect().toSet === (0 to n by 2).map(x => (x, "v")).toSet)
// And n/2 = 5 edges, from the center to each nonzero even vid.
assert(subgraph.edges.map(_.copy()).collect().toSet === (2 to n by 2).map(x => Edge(0, x, 1)).toSet)
}
}
test("EdgePartition.sort") {
val edgesFrom0 = List(Edge(0, 1, 0))
val edgesFrom1 = List(Edge(1, 0, 0), Edge(1, 2, 0))
val sortedEdges = edgesFrom0 ++ edgesFrom1
val builder = new EdgePartitionBuilder[Int]
for (e <- Random.shuffle(sortedEdges)) {
builder.add(e.srcId, e.dstId, e.attr)
}
val edgePartition = builder.toEdgePartition
assert(edgePartition.iterator.map(_.copy()).toList === sortedEdges)
assert(edgePartition.indexIterator(_ == 0).map(_.copy()).toList === edgesFrom0)
assert(edgePartition.indexIterator(_ == 1).map(_.copy()).toList === edgesFrom1)
}
} }

View file

@ -0,0 +1,43 @@
package org.apache.spark.graph
import org.scalatest.FunSuite
import org.apache.spark.SparkContext
import org.apache.spark.graph.LocalSparkContext._
import org.apache.spark.rdd._
class PregelSuite extends FunSuite with LocalSparkContext {
System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
System.setProperty("spark.kryo.registrator", "org.apache.spark.graph.GraphKryoRegistrator")
test("1 iteration") {
withSpark(new SparkContext("local", "test")) { sc =>
val n = 5
val star = Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: Vid, x: Vid)), 3), "v")
val result = Pregel(star, 0)(
(vid, attr, msg) => attr,
et => Iterator.empty,
(a: Int, b: Int) => throw new Exception("mergeMsg run unexpectedly"))
assert(result.vertices.collect.toSet === star.vertices.collect.toSet)
}
}
test("chain propagation") {
withSpark(new SparkContext("local", "test")) { sc =>
val n = 5
val chain = Graph.fromEdgeTuples(
sc.parallelize((1 until n).map(x => (x: Vid, x + 1: Vid)), 3),
0).cache()
assert(chain.vertices.collect.toSet === (1 to n).map(x => (x: Vid, 0)).toSet)
val chainWithSeed = chain.mapVertices { (vid, attr) => if (vid == 1) 1 else 0 }
assert(chainWithSeed.vertices.collect.toSet === Set((1: Vid, 1)) ++ (2 to n).map(x => (x: Vid, 0)).toSet)
val result = Pregel(chainWithSeed, 0)(
(vid, attr, msg) => math.max(msg, attr),
et => Iterator((et.dstId, et.srcAttr)),
(a: Int, b: Int) => math.max(a, b))
assert(result.vertices.collect.toSet ===
chain.vertices.mapValues { (vid, attr) => attr + 1 }.collect.toSet)
}
}
}
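The chain-propagation test checks that a single seed value of 1 at vertex 1 eventually reaches every vertex under a max-aggregation Pregel program. A Spark-free simulation of the same fixed point (illustrative only; indices stand in for vertex ids):

object ChainPropagationSketch extends App {
  val n = 5
  var attrs = (1 to n).map(v => if (v == 1) 1 else 0).toArray  // slot i-1 holds vertex i
  var changed = true
  while (changed) {
    val next = attrs.clone()
    for (i <- 1 until n) next(i) = math.max(next(i), attrs(i - 1))  // message along edge i -> i+1
    changed = !next.sameElements(attrs)
    attrs = next
  }
  println(attrs.toList)  // List(1, 1, 1, 1, 1)
}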

View file

@ -1,13 +1,16 @@
package org.apache.spark.graph package org.apache.spark.graph
import java.io.{EOFException, ByteArrayInputStream, ByteArrayOutputStream}
import scala.util.Random
import org.scalatest.FunSuite import org.scalatest.FunSuite
import org.apache.spark.SparkContext import org.apache.spark._
import org.apache.spark.graph.LocalSparkContext._ import org.apache.spark.graph.LocalSparkContext._
import java.io.{EOFException, ByteArrayInputStream, ByteArrayOutputStream}
import org.apache.spark.graph.impl._ import org.apache.spark.graph.impl._
import org.apache.spark.graph.impl.MsgRDDFunctions._ import org.apache.spark.graph.impl.MsgRDDFunctions._
import org.apache.spark._ import org.apache.spark.serializer.SerializationStream
class SerializerSuite extends FunSuite with LocalSparkContext { class SerializerSuite extends FunSuite with LocalSparkContext {
@ -79,7 +82,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
} }
test("IntAggMsgSerializer") { test("IntAggMsgSerializer") {
val outMsg = new AggregationMsg[Int](4, 5) val outMsg = (4: Vid, 5)
val bout = new ByteArrayOutputStream val bout = new ByteArrayOutputStream
val outStrm = new IntAggMsgSerializer().newInstance().serializeStream(bout) val outStrm = new IntAggMsgSerializer().newInstance().serializeStream(bout)
outStrm.writeObject(outMsg) outStrm.writeObject(outMsg)
@ -87,12 +90,10 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
bout.flush() bout.flush()
val bin = new ByteArrayInputStream(bout.toByteArray) val bin = new ByteArrayInputStream(bout.toByteArray)
val inStrm = new IntAggMsgSerializer().newInstance().deserializeStream(bin) val inStrm = new IntAggMsgSerializer().newInstance().deserializeStream(bin)
val inMsg1: AggregationMsg[Int] = inStrm.readObject() val inMsg1: (Vid, Int) = inStrm.readObject()
val inMsg2: AggregationMsg[Int] = inStrm.readObject() val inMsg2: (Vid, Int) = inStrm.readObject()
assert(outMsg.vid === inMsg1.vid) assert(outMsg === inMsg1)
assert(outMsg.vid === inMsg2.vid) assert(outMsg === inMsg2)
assert(outMsg.data === inMsg1.data)
assert(outMsg.data === inMsg2.data)
intercept[EOFException] { intercept[EOFException] {
inStrm.readObject() inStrm.readObject()
@ -100,7 +101,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
} }
test("LongAggMsgSerializer") { test("LongAggMsgSerializer") {
val outMsg = new AggregationMsg[Long](4, 1L << 32) val outMsg = (4: Vid, 1L << 32)
val bout = new ByteArrayOutputStream val bout = new ByteArrayOutputStream
val outStrm = new LongAggMsgSerializer().newInstance().serializeStream(bout) val outStrm = new LongAggMsgSerializer().newInstance().serializeStream(bout)
outStrm.writeObject(outMsg) outStrm.writeObject(outMsg)
@ -108,12 +109,10 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
bout.flush() bout.flush()
val bin = new ByteArrayInputStream(bout.toByteArray) val bin = new ByteArrayInputStream(bout.toByteArray)
val inStrm = new LongAggMsgSerializer().newInstance().deserializeStream(bin) val inStrm = new LongAggMsgSerializer().newInstance().deserializeStream(bin)
val inMsg1: AggregationMsg[Long] = inStrm.readObject() val inMsg1: (Vid, Long) = inStrm.readObject()
val inMsg2: AggregationMsg[Long] = inStrm.readObject() val inMsg2: (Vid, Long) = inStrm.readObject()
assert(outMsg.vid === inMsg1.vid) assert(outMsg === inMsg1)
assert(outMsg.vid === inMsg2.vid) assert(outMsg === inMsg2)
assert(outMsg.data === inMsg1.data)
assert(outMsg.data === inMsg2.data)
intercept[EOFException] { intercept[EOFException] {
inStrm.readObject() inStrm.readObject()
@ -121,7 +120,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
} }
test("DoubleAggMsgSerializer") { test("DoubleAggMsgSerializer") {
val outMsg = new AggregationMsg[Double](4, 5.0) val outMsg = (4: Vid, 5.0)
val bout = new ByteArrayOutputStream val bout = new ByteArrayOutputStream
val outStrm = new DoubleAggMsgSerializer().newInstance().serializeStream(bout) val outStrm = new DoubleAggMsgSerializer().newInstance().serializeStream(bout)
outStrm.writeObject(outMsg) outStrm.writeObject(outMsg)
@ -129,12 +128,10 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
bout.flush() bout.flush()
val bin = new ByteArrayInputStream(bout.toByteArray) val bin = new ByteArrayInputStream(bout.toByteArray)
val inStrm = new DoubleAggMsgSerializer().newInstance().deserializeStream(bin) val inStrm = new DoubleAggMsgSerializer().newInstance().deserializeStream(bin)
val inMsg1: AggregationMsg[Double] = inStrm.readObject() val inMsg1: (Vid, Double) = inStrm.readObject()
val inMsg2: AggregationMsg[Double] = inStrm.readObject() val inMsg2: (Vid, Double) = inStrm.readObject()
assert(outMsg.vid === inMsg1.vid) assert(outMsg === inMsg1)
assert(outMsg.vid === inMsg2.vid) assert(outMsg === inMsg2)
assert(outMsg.data === inMsg1.data)
assert(outMsg.data === inMsg2.data)
intercept[EOFException] { intercept[EOFException] {
inStrm.readObject() inStrm.readObject()
@ -150,11 +147,35 @@ class SerializerSuite extends FunSuite with LocalSparkContext {
} }
} }
test("TestShuffleAggregationMsg") { test("variable long encoding") {
withSpark(new SparkContext("local[2]", "test")) { sc => def testVarLongEncoding(v: Long, optimizePositive: Boolean) {
val bmsgs = sc.parallelize(0 until 100, 10).map(pid => new AggregationMsg[Int](pid, pid)) val bout = new ByteArrayOutputStream
bmsgs.partitionBy(new HashPartitioner(3)).collect() val stream = new ShuffleSerializationStream(bout) {
def writeObject[T](t: T): SerializationStream = {
writeVarLong(t.asInstanceOf[Long], optimizePositive = optimizePositive)
this
} }
} }
stream.writeObject(v)
val bin = new ByteArrayInputStream(bout.toByteArray)
val dstream = new ShuffleDeserializationStream(bin) {
def readObject[T](): T = {
readVarLong(optimizePositive).asInstanceOf[T]
}
}
val read = dstream.readObject[Long]()
assert(read === v)
}
// Test all variable-length encoding code paths (each branch handles 7 more bits, so use test
// values near each power (1L << (7 * k))).
val d = Random.nextLong() % 128
Seq[Long](0, (1L << 0) + d, (1L << 7) + d, (1L << 14) + d, (1L << 21) + d, (1L << 28) + d,
(1L << 35) + d, (1L << 42) + d, (1L << 49) + d, (1L << 56) + d, (1L << 63) + d).foreach { number =>
testVarLongEncoding(number, optimizePositive = false)
testVarLongEncoding(number, optimizePositive = true)
testVarLongEncoding(-number, optimizePositive = false)
testVarLongEncoding(-number, optimizePositive = true)
}
}
} }
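The variable-length long encoding exercised above packs 7 payload bits per byte, with the high bit marking that more bytes follow. A standalone round-trip sketch of that scheme (the usual unsigned varint; the actual ShuffleSerializationStream may differ in how it treats optimizePositive and negative values):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

object VarLongSketch extends App {
  def writeVarLong(value: Long, out: ByteArrayOutputStream): Unit = {
    var v = value
    while ((v & ~0x7FL) != 0L) {
      out.write(((v & 0x7F) | 0x80).toInt)  // 7 payload bits plus a continuation bit
      v >>>= 7
    }
    out.write(v.toInt)                       // final byte: continuation bit clear
  }

  def readVarLong(in: ByteArrayInputStream): Long = {
    var result = 0L
    var shift = 0
    var b = in.read()
    while ((b & 0x80) != 0) {                // keep reading while the continuation bit is set
      result |= (b & 0x7FL) << shift
      shift += 7
      b = in.read()
    }
    result | ((b & 0x7FL) << shift)
  }

  val bout = new ByteArrayOutputStream
  writeVarLong(300L, bout)
  val bin = new ByteArrayInputStream(bout.toByteArray)
  assert(readVarLong(bin) == 300L)
  println("round-tripped 300 in " + bout.size + " bytes")  // 2 bytes
}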