Integrated IndexedRDD into graph design.
This commit is contained in:
parent
fa2f87ca63
commit
494472a6cc
|
@ -49,6 +49,32 @@ class IndexedRDDFunctions[K: ClassManifest, V: ClassManifest](self: IndexedRDD[K
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Pass each value in the indexed key-value RDD through a function of (key, value) without
 * changing the keys. The existing index is reused, so the original RDD's partitioning is
 * retained.
 *
 * @param f function from (key, value) to the new value
 * @return a new IndexedRDD sharing this RDD's index and holding the mapped values
 */
override def mapValuesWithKeys[U: ClassManifest](f: (K, V) => U): RDD[(K, U)] = {
  val cleanF = self.index.rdd.context.clean(f)
  val newValues = self.index.rdd.zipPartitions(self.valuesRDD){ (keysIter, valuesIter) =>
    // Each partition is expected to hold exactly one index and one block of values.
    val index = keysIter.next()
    assert(!keysIter.hasNext)
    val oldValues = valuesIter.next()
    assert(!valuesIter.hasNext)
    // Allocate the array to store the results into
    val mappedValues: Array[Seq[U]] = new Array[Seq[U]](oldValues.size)
    // Populate the new values; slots that are null in the old block stay null.
    for ((k, i) <- index) {
      if (oldValues(i) != null) {
        // Use the cleaned closure: the original computed cleanF but captured the raw f here.
        mappedValues(i) = oldValues(i).map(v => cleanF(k, v))
      }
    }
    Array(mappedValues.toSeq).iterator
  }
  new IndexedRDD[K, U](self.index, newValues)
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pass each value in the key-value pair RDD through a flatMap function without changing the
|
* Pass each value in the key-value pair RDD through a flatMap function without changing the
|
||||||
* keys; this also retains the original RDD's partitioning.
|
* keys; this also retains the original RDD's partitioning.
|
||||||
|
|
|
@ -53,6 +53,8 @@ class RDDIndex[@specialized K: ClassManifest](private[spark] val rdd: RDD[BlockI
|
||||||
rdd.persist(newLevel)
|
rdd.persist(newLevel)
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Partitioner of the wrapped index RDD. Uses `.get`, so this presumably throws when
// rdd.partitioner is empty (assumes it is an Option) — TODO confirm callers guarantee one.
def partitioner: Partitioner = rdd.partitioner.get
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,6 +87,9 @@ class IndexedRDD[K: ClassManifest, V: ClassManifest](
|
||||||
override val partitioner = index.rdd.partitioner
|
override val partitioner = index.rdd.partitioner
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The actual partitions are defined by the tuples.
|
* The actual partitions are defined by the tuples.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -393,6 +393,15 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](self: RDD[(K, V)])
|
||||||
new MappedValuesRDD(self, cleanF)
|
new MappedValuesRDD(self, cleanF)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Pass each key-value pair through a function that may read the key but only replaces the
 * value. Keys are never changed, so the original RDD's partitioning is retained.
 *
 * @param f function from (key, value) to the new value
 * @return an RDD pairing each original key with the value computed by `f`
 */
def mapValuesWithKeys[U: ClassManifest](f: (K, V) => U): RDD[(K, U)] = {
  // A plain `map` does not declare that partitioning survives, even though keys are
  // untouched. Go through mapPartitions and state it explicitly, as documented above.
  self.mapPartitions(
    iter => iter.map { case (k, v) => (k, f(k, v)) },
    preservesPartitioning = true)
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pass each value in the key-value pair RDD through a flatMap function without changing the
|
* Pass each value in the key-value pair RDD through a flatMap function without changing the
|
||||||
* keys; this also retains the original RDD's partitioning.
|
* keys; this also retains the original RDD's partitioning.
|
||||||
|
|
|
@ -44,9 +44,9 @@ object Analytics extends Logging {
|
||||||
numIter: Int,
|
numIter: Int,
|
||||||
resetProb: Double = 0.15) = {
|
resetProb: Double = 0.15) = {
|
||||||
// Compute the out degree of each vertex
|
// Compute the out degree of each vertex
|
||||||
val pagerankGraph = graph.leftJoinVertices[Int, (Int, Double)](graph.outDegrees,
|
val pagerankGraph = graph.outerJoinVertices(graph.outDegrees){
|
||||||
(vertex, deg) => (deg.getOrElse(0), 1.0)
|
(vid, vdata, deg) => (deg.getOrElse(0), 1.0)
|
||||||
)
|
}
|
||||||
|
|
||||||
println("Vertex Replication: " + pagerankGraph.replication)
|
println("Vertex Replication: " + pagerankGraph.replication)
|
||||||
|
|
||||||
|
@ -59,11 +59,11 @@ object Analytics extends Logging {
|
||||||
|
|
||||||
|
|
||||||
Pregel.iterate[(Int, Double), ED, Double](pagerankGraph)(
|
Pregel.iterate[(Int, Double), ED, Double](pagerankGraph)(
|
||||||
(vertex, a: Double) => (vertex.data._1, (resetProb + (1.0 - resetProb) * a)), // apply
|
(vid, data, a: Double) => (data._1, (resetProb + (1.0 - resetProb) * a)), // apply
|
||||||
(me_id, edge) => Some(edge.src.data._2 / edge.src.data._1), // gather
|
(me_id, edge) => Some(edge.src.data._2 / edge.src.data._1), // gather
|
||||||
(a: Double, b: Double) => a + b, // merge
|
(a: Double, b: Double) => a + b, // merge
|
||||||
1.0,
|
1.0,
|
||||||
numIter).mapVertices{ case Vertex(id, (outDeg, r)) => r }
|
numIter).mapVertices{ case (id, (outDeg, r)) => r }
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -74,18 +74,19 @@ object Analytics extends Logging {
|
||||||
maxIter: Int = Integer.MAX_VALUE,
|
maxIter: Int = Integer.MAX_VALUE,
|
||||||
resetProb: Double = 0.15) = {
|
resetProb: Double = 0.15) = {
|
||||||
// Compute the out degree of each vertex
|
// Compute the out degree of each vertex
|
||||||
val pagerankGraph = graph.leftJoinVertices[Int, (Int, Double, Double)](graph.outDegrees,
|
val pagerankGraph = graph.outerJoinVertices(graph.outDegrees){
|
||||||
(vertex, degIter) => (degIter.sum, 1.0, 1.0)
|
(id, data, degIter) => (degIter.sum, 1.0, 1.0)
|
||||||
)
|
}
|
||||||
|
|
||||||
|
|
||||||
// Run PageRank
|
// Run PageRank
|
||||||
GraphLab.iterate(pagerankGraph)(
|
GraphLab.iterate(pagerankGraph)(
|
||||||
(me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
|
(me_id, edge) => edge.src.data._2 / edge.src.data._1, // gather
|
||||||
(a: Double, b: Double) => a + b,
|
(a: Double, b: Double) => a + b,
|
||||||
(vertex, a: Option[Double]) =>
|
(id, data, a: Option[Double]) =>
|
||||||
(vertex.data._1, (resetProb + (1.0 - resetProb) * a.getOrElse(0.0)), vertex.data._2), // apply
|
(data._1, (resetProb + (1.0 - resetProb) * a.getOrElse(0.0)), data._2), // apply
|
||||||
(me_id, edge) => math.abs(edge.src.data._3 - edge.src.data._2) > tol, // scatter
|
(me_id, edge) => math.abs(edge.src.data._3 - edge.src.data._2) > tol, // scatter
|
||||||
maxIter).mapVertices { case Vertex(vid, data) => data._2 }
|
maxIter).mapVertices { case (vid, data) => data._2 }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -96,12 +97,12 @@ object Analytics extends Logging {
|
||||||
* that vertex.
|
* that vertex.
|
||||||
*/
|
*/
|
||||||
def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED]) = {
|
def connectedComponents[VD: Manifest, ED: Manifest](graph: Graph[VD, ED]) = {
|
||||||
val ccGraph = graph.mapVertices { case Vertex(vid, _) => vid }
|
val ccGraph = graph.mapVertices { case (vid, _) => vid }
|
||||||
|
|
||||||
GraphLab.iterate(ccGraph)(
|
GraphLab.iterate(ccGraph)(
|
||||||
(me_id, edge) => edge.otherVertex(me_id).data, // gather
|
(me_id, edge) => edge.otherVertex(me_id).data, // gather
|
||||||
(a: Vid, b: Vid) => math.min(a, b), // merge
|
(a: Vid, b: Vid) => math.min(a, b), // merge
|
||||||
(v, a: Option[Vid]) => math.min(v.data, a.getOrElse(Long.MaxValue)), // apply
|
(id, data, a: Option[Vid]) => math.min(data, a.getOrElse(Long.MaxValue)), // apply
|
||||||
(me_id, edge) => (edge.vertex(me_id).data < edge.otherVertex(me_id).data), // scatter
|
(me_id, edge) => (edge.vertex(me_id).data < edge.otherVertex(me_id).data), // scatter
|
||||||
gatherDirection = EdgeDirection.Both, scatterDirection = EdgeDirection.Both
|
gatherDirection = EdgeDirection.Both, scatterDirection = EdgeDirection.Both
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,6 +2,7 @@ package org.apache.spark.graph
|
||||||
|
|
||||||
|
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.util.ClosureCleaner
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,7 +34,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
*
|
*
|
||||||
* @todo should vertices return tuples instead of vertex objects?
|
* @todo should vertices return tuples instead of vertex objects?
|
||||||
*/
|
*/
|
||||||
def vertices: RDD[Vertex[VD]]
|
def vertices: RDD[(Vid,VD)]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the Edges and their data as an RDD. The entries in the RDD contain
|
* Get the Edges and their data as an RDD. The entries in the RDD contain
|
||||||
|
@ -101,7 +102,7 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
* }}}
|
* }}}
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
def mapVertices[VD2: ClassManifest](map: Vertex[VD] => VD2): Graph[VD2, ED]
|
def mapVertices[VD2: ClassManifest](map: (Vid, VD) => VD2): Graph[VD2, ED]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a new graph where the value of each edge is transformed by
|
* Construct a new graph where the value of each edge is transformed by
|
||||||
|
@ -149,13 +150,13 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
|
map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
|
||||||
|
|
||||||
|
|
||||||
/**
|
// /**
|
||||||
* Remove edges connecting vertices that are not in the graph.
|
// * Remove edges connecting vertices that are not in the graph.
|
||||||
*
|
// *
|
||||||
* @todo remove this function and ensure that for a graph G=(V,E):
|
// * @todo remove this function and ensure that for a graph G=(V,E):
|
||||||
* if (u,v) in E then u in V and v in V
|
// * if (u,v) in E then u in V and v in V
|
||||||
*/
|
// */
|
||||||
def correctEdges(): Graph[VD, ED]
|
// def correctEdges(): Graph[VD, ED]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a new graph with all the edges reversed. If this graph contains
|
* Construct a new graph with all the edges reversed. If this graph contains
|
||||||
|
@ -183,8 +184,8 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
* @return the subgraph containing only the vertices and edges that satisfy the
|
* @return the subgraph containing only the vertices and edges that satisfy the
|
||||||
* predicates.
|
* predicates.
|
||||||
*/
|
*/
|
||||||
def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (_ => true),
|
def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (x => true),
|
||||||
vpred: Vertex[VD] => Boolean = (_ => true) ): Graph[VD, ED]
|
vpred: (Vid, VD) => Boolean = ((v,d) => true) ): Graph[VD, ED]
|
||||||
|
|
||||||
|
|
||||||
// /**
|
// /**
|
||||||
|
@ -200,51 +201,55 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def mapReduceTriplets[A: ClassManifest](
|
||||||
|
mapFunc: EdgeTriplet[VD, ED] => Array[(Vid, A)],
|
||||||
|
reduceFunc: (A, A) => A)
|
||||||
|
: RDD[(Vid, A)]
|
||||||
|
|
||||||
|
|
||||||
/**
|
// /**
|
||||||
* This function is used to compute a statistic for the neighborhood of each
|
// * This function is used to compute a statistic for the neighborhood of each
|
||||||
* vertex.
|
// * vertex.
|
||||||
*
|
// *
|
||||||
* This is one of the core functions in the Graph API in that it enables
|
// * This is one of the core functions in the Graph API in that it enables
|
||||||
* neighborhood level computation. For example this function can be used to
|
// * neighborhood level computation. For example this function can be used to
|
||||||
* count neighbors satisfying a predicate or implement PageRank.
|
// * count neighbors satisfying a predicate or implement PageRank.
|
||||||
*
|
// *
|
||||||
* @note The returned RDD may contain fewer entries than there are vertices
|
// * @note The returned RDD may contain fewer entries than there are vertices
|
||||||
* in the graph. This is because some vertices may not have neighbors or the
|
// * in the graph. This is because some vertices may not have neighbors or the
|
||||||
* map function may return None for all neighbors.
|
// * map function may return None for all neighbors.
|
||||||
*
|
// *
|
||||||
* @param mapFunc the function applied to each edge adjacent to each vertex.
|
// * @param mapFunc the function applied to each edge adjacent to each vertex.
|
||||||
* The mapFunc can optionally return None in which case it does not
|
// * The mapFunc can optionally return None in which case it does not
|
||||||
* contribute to the final sum.
|
// * contribute to the final sum.
|
||||||
* @param mergeFunc the function used to merge the results of each map
|
// * @param mergeFunc the function used to merge the results of each map
|
||||||
* operation.
|
// * operation.
|
||||||
* @param direction the direction of edges to consider (e.g., In, Out, Both).
|
// * @param direction the direction of edges to consider (e.g., In, Out, Both).
|
||||||
* @tparam VD2 The returned type of the aggregation operation.
|
// * @tparam VD2 The returned type of the aggregation operation.
|
||||||
*
|
// *
|
||||||
* @return A Spark.RDD containing tuples of vertex identifiers and the
|
// * @return A Spark.RDD containing tuples of vertex identifiers and the
|
||||||
* resulting value. Note that the returned RDD may contain fewer vertices
|
// * resulting value. Note that the returned RDD may contain fewer vertices
|
||||||
* than in the original graph since some vertices may not have neighbors or
|
// * than in the original graph since some vertices may not have neighbors or
|
||||||
* the map function could return None for all neighbors.
|
// * the map function could return None for all neighbors.
|
||||||
*
|
// *
|
||||||
* @example We can use this function to compute the average follower age for
|
// * @example We can use this function to compute the average follower age for
|
||||||
* each user
|
// * each user
|
||||||
* {{{
|
// * {{{
|
||||||
* val graph: Graph[Int,Int] = loadGraph()
|
// * val graph: Graph[Int,Int] = loadGraph()
|
||||||
* val averageFollowerAge: RDD[(Int, Int)] =
|
// * val averageFollowerAge: RDD[(Int, Int)] =
|
||||||
* graph.aggregateNeighbors[(Int,Double)](
|
// * graph.aggregateNeighbors[(Int,Double)](
|
||||||
* (vid, edge) => (edge.otherVertex(vid).data, 1),
|
// * (vid, edge) => (edge.otherVertex(vid).data, 1),
|
||||||
* (a, b) => (a._1 + b._1, a._2 + b._2),
|
// * (a, b) => (a._1 + b._1, a._2 + b._2),
|
||||||
* EdgeDirection.In)
|
// * EdgeDirection.In)
|
||||||
* .mapValues{ case (sum,followers) => sum.toDouble / followers}
|
// * .mapValues{ case (sum,followers) => sum.toDouble / followers}
|
||||||
* }}}
|
// * }}}
|
||||||
*
|
// *
|
||||||
*/
|
// */
|
||||||
def aggregateNeighbors[A: ClassManifest](
|
// def aggregateNeighbors[A: ClassManifest](
|
||||||
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
// mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
||||||
mergeFunc: (A, A) => A,
|
// mergeFunc: (A, A) => A,
|
||||||
direction: EdgeDirection)
|
// direction: EdgeDirection)
|
||||||
: Graph[(VD, Option[A]), ED]
|
// : Graph[(VD, Option[A]), ED]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function is used to compute a statistic for the neighborhood of each
|
* This function is used to compute a statistic for the neighborhood of each
|
||||||
|
@ -291,9 +296,8 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
def aggregateNeighbors[A: ClassManifest](
|
def aggregateNeighbors[A: ClassManifest](
|
||||||
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
||||||
reduceFunc: (A, A) => A,
|
reduceFunc: (A, A) => A,
|
||||||
default: A, // Should this be a function or a value?
|
|
||||||
direction: EdgeDirection)
|
direction: EdgeDirection)
|
||||||
: Graph[(VD, Option[A]), ED]
|
: RDD[(Vid, A)]
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -328,9 +332,8 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
* }}}
|
* }}}
|
||||||
* @todo Is leftJoinVertices the right name?
|
* @todo Is leftJoinVertices the right name?
|
||||||
*/
|
*/
|
||||||
def leftJoinVertices[U: ClassManifest, VD2: ClassManifest](
|
def outerJoinVertices[U: ClassManifest, VD2: ClassManifest](table: RDD[(Vid, U)])
|
||||||
table: RDD[(Vid, U)],
|
(mapFunc: (Vid, VD, Option[U]) => VD2)
|
||||||
mapFunc: (Vertex[VD], Option[U]) => VD2)
|
|
||||||
: Graph[VD2, ED]
|
: Graph[VD2, ED]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -366,10 +369,15 @@ abstract class Graph[VD: ClassManifest, ED: ClassManifest] {
|
||||||
* graph.joinVertices(tbl)( (v, row) => row )
|
* graph.joinVertices(tbl)( (v, row) => row )
|
||||||
* }}}
|
* }}}
|
||||||
*/
|
*/
|
||||||
def joinVertices[U: ClassManifest](
|
def joinVertices[U: ClassManifest](table: RDD[(Vid, U)])(mapFunc: (Vid, VD, U) => VD)
|
||||||
table: RDD[(Vid, U)],
|
: Graph[VD, ED] = {
|
||||||
mapFunc: (Vertex[VD], U) => VD)
|
ClosureCleaner.clean(mapFunc)
|
||||||
: Graph[VD, ED]
|
def uf(id: Vid, data: VD, o: Option[U]): VD = o match {
|
||||||
|
case Some(u) => mapFunc(id, data, u)
|
||||||
|
case None => data
|
||||||
|
}
|
||||||
|
outerJoinVertices(table)(uf)
|
||||||
|
}
|
||||||
|
|
||||||
// Save a copy of the GraphOps object so there is always one unique GraphOps object
|
// Save a copy of the GraphOps object so there is always one unique GraphOps object
|
||||||
// for a given Graph object, and thus the lazy vals in GraphOps would work as intended.
|
// for a given Graph object, and thus the lazy vals in GraphOps would work as intended.
|
||||||
|
@ -391,16 +399,16 @@ object Graph {
|
||||||
rawEdges.map { case (s, t) => Edge(s, t, 1) }
|
rawEdges.map { case (s, t) => Edge(s, t, 1) }
|
||||||
}
|
}
|
||||||
// Determine unique vertices
|
// Determine unique vertices
|
||||||
val vertices: RDD[Vertex[Int]] = edges.flatMap{ case Edge(s, t, cnt) => Array((s, 1), (t, 1)) }
|
val vertices: RDD[(Vid, Int)] =
|
||||||
.reduceByKey(_ + _)
|
edges.flatMap{ case Edge(s, t, cnt) => Array((s, 1), (t, 1)) }.reduceByKey(_ + _)
|
||||||
.map{ case (id, deg) => Vertex(id, deg) }
|
|
||||||
// Return graph
|
// Return graph
|
||||||
new GraphImpl(vertices, edges)
|
GraphImpl(vertices, edges)
|
||||||
}
|
}
|
||||||
|
|
||||||
def apply[VD: ClassManifest, ED: ClassManifest](
|
def apply[VD: ClassManifest, ED: ClassManifest](
|
||||||
vertices: RDD[Vertex[VD]], edges: RDD[Edge[ED]]): Graph[VD, ED] = {
|
vertices: RDD[(Vid,VD)], edges: RDD[Edge[ED]]): Graph[VD, ED] = {
|
||||||
new GraphImpl(vertices, edges)
|
GraphImpl(vertices, edges)
|
||||||
}
|
}
|
||||||
|
|
||||||
implicit def graphToGraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) = g.ops
|
implicit def graphToGraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) = g.ops
|
||||||
|
|
|
@ -36,7 +36,7 @@ object GraphLab {
|
||||||
def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
|
def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
|
||||||
gatherFunc: (Vid, EdgeTriplet[VD, ED]) => A,
|
gatherFunc: (Vid, EdgeTriplet[VD, ED]) => A,
|
||||||
mergeFunc: (A, A) => A,
|
mergeFunc: (A, A) => A,
|
||||||
applyFunc: (Vertex[VD], Option[A]) => VD,
|
applyFunc: (Vid, VD, Option[A]) => VD,
|
||||||
scatterFunc: (Vid, EdgeTriplet[VD, ED]) => Boolean,
|
scatterFunc: (Vid, EdgeTriplet[VD, ED]) => Boolean,
|
||||||
numIter: Int = Integer.MAX_VALUE,
|
numIter: Int = Integer.MAX_VALUE,
|
||||||
gatherDirection: EdgeDirection = EdgeDirection.In,
|
gatherDirection: EdgeDirection = EdgeDirection.In,
|
||||||
|
@ -45,7 +45,7 @@ object GraphLab {
|
||||||
|
|
||||||
// Add an active attribute to all vertices to track convergence.
|
// Add an active attribute to all vertices to track convergence.
|
||||||
var activeGraph: Graph[(Boolean, VD), ED] = graph.mapVertices {
|
var activeGraph: Graph[(Boolean, VD), ED] = graph.mapVertices {
|
||||||
case Vertex(id, data) => (true, data)
|
case (id, data) => (true, data)
|
||||||
}.cache()
|
}.cache()
|
||||||
|
|
||||||
// The gather function wrapper strips the active attribute and
|
// The gather function wrapper strips the active attribute and
|
||||||
|
@ -64,9 +64,9 @@ object GraphLab {
|
||||||
|
|
||||||
// The apply function wrapper strips the vertex of the active attribute
|
// The apply function wrapper strips the vertex of the active attribute
|
||||||
// and only invokes the apply function on active vertices
|
// and only invokes the apply function on active vertices
|
||||||
def apply(v: Vertex[((Boolean, VD), Option[A])]): (Boolean, VD) = {
|
def apply(vid: Vid, data: (Boolean, VD), accum: Option[A]): (Boolean, VD) = {
|
||||||
val ((active, vData), accum) = v.data
|
val (active, vData) = data
|
||||||
if (active) (true, applyFunc(Vertex(v.id, vData), accum))
|
if (active) (true, applyFunc(vid, vData, accum))
|
||||||
else (false, vData)
|
else (false, vData)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,9 +89,9 @@ object GraphLab {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to set the active status of vertices for the next round
|
// Used to set the active status of vertices for the next round
|
||||||
def applyActive(v: Vertex[((Boolean, VD), Option[Boolean])]): (Boolean, VD) = {
|
def applyActive(vid: Vid, data: (Boolean, VD), newActive: Boolean): (Boolean, VD) = {
|
||||||
val ((prevActive, vData), newActive) = v.data
|
val (prevActive, vData) = data
|
||||||
(newActive.getOrElse(false), vData)
|
(newActive, vData)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Main Loop ---------------------------------------------------------------------
|
// Main Loop ---------------------------------------------------------------------
|
||||||
|
@ -99,29 +99,32 @@ object GraphLab {
|
||||||
var numActive = activeGraph.numVertices
|
var numActive = activeGraph.numVertices
|
||||||
while (i < numIter && numActive > 0) {
|
while (i < numIter && numActive > 0) {
|
||||||
|
|
||||||
val gathered: Graph[((Boolean, VD), Option[A]), ED] =
|
// Gather
|
||||||
|
val gathered: RDD[(Vid, A)] =
|
||||||
activeGraph.aggregateNeighbors(gather, mergeFunc, gatherDirection)
|
activeGraph.aggregateNeighbors(gather, mergeFunc, gatherDirection)
|
||||||
|
|
||||||
val applied: Graph[(Boolean, VD), ED] = gathered.mapVertices(apply).cache()
|
// Apply
|
||||||
|
activeGraph = activeGraph.outerJoinVertices(gathered)(apply).cache()
|
||||||
|
|
||||||
|
|
||||||
activeGraph = applied.cache()
|
|
||||||
|
|
||||||
// Scatter is basically a gather in the opposite direction so we reverse the edge direction
|
// Scatter is basically a gather in the opposite direction so we reverse the edge direction
|
||||||
// activeGraph: Graph[(Boolean, VD), ED]
|
// activeGraph: Graph[(Boolean, VD), ED]
|
||||||
val scattered: Graph[((Boolean, VD), Option[Boolean]), ED] =
|
val scattered: RDD[(Vid, Boolean)] =
|
||||||
activeGraph.aggregateNeighbors(scatter, _ || _, scatterDirection.reverse)
|
activeGraph.aggregateNeighbors(scatter, _ || _, scatterDirection.reverse)
|
||||||
val newActiveGraph: Graph[(Boolean, VD), ED] =
|
|
||||||
scattered.mapVertices(applyActive)
|
|
||||||
|
|
||||||
activeGraph = newActiveGraph.cache()
|
activeGraph = activeGraph.joinVertices(scattered)(applyActive).cache()
|
||||||
|
|
||||||
numActive = activeGraph.vertices.map(v => if (v.data._1) 1 else 0).reduce(_ + _)
|
// Calculate the number of active vertices
|
||||||
|
numActive = activeGraph.vertices.map{
|
||||||
|
case (vid, data) => if (data._1) 1 else 0
|
||||||
|
}.reduce(_ + _)
|
||||||
println("Number active vertices: " + numActive)
|
println("Number active vertices: " + numActive)
|
||||||
i += 1
|
i += 1
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove the active attribute from the vertex data before returning the graph
|
// Remove the active attribute from the vertex data before returning the graph
|
||||||
activeGraph.mapVertices(v => v.data._2)
|
activeGraph.mapVertices{case (vid, data) => data._2 }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ object GraphLoader {
|
||||||
def fromEdges[ED: ClassManifest](edges: RDD[Edge[ED]]): GraphImpl[Int, ED] = {
|
def fromEdges[ED: ClassManifest](edges: RDD[Edge[ED]]): GraphImpl[Int, ED] = {
|
||||||
val vertices = edges.flatMap { edge => List((edge.src, 1), (edge.dst, 1)) }
|
val vertices = edges.flatMap { edge => List((edge.src, 1), (edge.dst, 1)) }
|
||||||
.reduceByKey(_ + _)
|
.reduceByKey(_ + _)
|
||||||
.map{ case (vid, degree) => Vertex(vid, degree) }
|
.map{ case (vid, degree) => (vid, degree) }
|
||||||
new GraphImpl[Int, ED](vertices, edges)
|
GraphImpl(vertices, edges)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
package org.apache.spark.graph
|
package org.apache.spark.graph
|
||||||
|
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.SparkContext._
|
||||||
|
|
||||||
class GraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) {
|
class GraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) {
|
||||||
|
|
||||||
|
@ -16,22 +16,18 @@ class GraphOps[VD: ClassManifest, ED: ClassManifest](g: Graph[VD, ED]) {
|
||||||
lazy val degrees: RDD[(Vid, Int)] = degreesRDD(EdgeDirection.Both)
|
lazy val degrees: RDD[(Vid, Int)] = degreesRDD(EdgeDirection.Both)
|
||||||
|
|
||||||
def collectNeighborIds(edgeDirection: EdgeDirection) : RDD[(Vid, Array[Vid])] = {
|
def collectNeighborIds(edgeDirection: EdgeDirection) : RDD[(Vid, Array[Vid])] = {
|
||||||
val graph: Graph[(VD, Option[Array[Vid]]), ED] = g.aggregateNeighbors(
|
val nbrs = g.aggregateNeighbors[Array[Vid]](
|
||||||
(vid, edge) => Some(Array(edge.otherVertex(vid).id)),
|
(vid, edge) => Some(Array(edge.otherVertex(vid).id)),
|
||||||
(a, b) => a ++ b,
|
(a, b) => a ++ b,
|
||||||
edgeDirection)
|
edgeDirection)
|
||||||
graph.vertices.map(v => {
|
|
||||||
val (_, neighborIds) = v.data
|
g.vertices.leftOuterJoin(nbrs).mapValues{
|
||||||
(v.id, neighborIds.getOrElse(Array()))
|
case (_, Some(nbrs)) => nbrs
|
||||||
})
|
case (_, None) => Array.empty[Vid]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def degreesRDD(edgeDirection: EdgeDirection): RDD[(Vid, Int)] = {
|
private def degreesRDD(edgeDirection: EdgeDirection): RDD[(Vid, Int)] = {
|
||||||
val degreeGraph: Graph[(VD, Option[Int]), ED] =
|
|
||||||
g.aggregateNeighbors((vid, edge) => Some(1), _+_, edgeDirection)
|
g.aggregateNeighbors((vid, edge) => Some(1), _+_, edgeDirection)
|
||||||
degreeGraph.vertices.map(v => {
|
|
||||||
val (_, degree) = v.data
|
|
||||||
(v.id, degree.getOrElse(0))
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@ import org.apache.spark.rdd.RDD
|
||||||
object Pregel {
|
object Pregel {
|
||||||
|
|
||||||
def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
|
def iterate[VD: ClassManifest, ED: ClassManifest, A: ClassManifest](graph: Graph[VD, ED])(
|
||||||
vprog: (Vertex[VD], A) => VD,
|
vprog: (Vid, VD, A) => VD,
|
||||||
sendMsg: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
sendMsg: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
||||||
mergeMsg: (A, A) => A,
|
mergeMsg: (A, A) => A,
|
||||||
initialMsg: A,
|
initialMsg: A,
|
||||||
|
@ -19,25 +19,26 @@ object Pregel {
|
||||||
|
|
||||||
def mapF(vid: Vid, edge: EdgeTriplet[VD,ED]) = sendMsg(edge.otherVertex(vid).id, edge)
|
def mapF(vid: Vid, edge: EdgeTriplet[VD,ED]) = sendMsg(edge.otherVertex(vid).id, edge)
|
||||||
|
|
||||||
def runProg(vertexWithMsgs: Vertex[(VD, Option[A])]): VD = {
|
def runProg(id: Vid, data: (VD, Option[A]) ): VD = {
|
||||||
val (vData, msg) = vertexWithMsgs.data
|
val (vData, msg) = data
|
||||||
val v = Vertex(vertexWithMsgs.id, vData)
|
|
||||||
msg match {
|
msg match {
|
||||||
case Some(m) => vprog(v, m)
|
case Some(m) => vprog(id, vData, m)
|
||||||
case None => v.data
|
case None => vData
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var graphWithMsgs: Graph[(VD, Option[A]), ED] =
|
// Receive the first set of messages
|
||||||
g.mapVertices(v => (v.data, Some(initialMsg)))
|
g.mapVertices( (vid, vdata) => vprog(vid, vdata, initialMsg))
|
||||||
|
|
||||||
while (i < numIter) {
|
while (i < numIter) {
|
||||||
val newGraph: Graph[VD, ED] = graphWithMsgs.mapVertices(runProg).cache()
|
// compute the messages
|
||||||
graphWithMsgs = newGraph.aggregateNeighbors(mapF, mergeMsg, EdgeDirection.In)
|
val messages = g.aggregateNeighbors(mapF, mergeMsg, EdgeDirection.In)
|
||||||
|
// receive the messages
|
||||||
|
g = g.joinVertices(messages)(vprog)
|
||||||
|
// count the iteration
|
||||||
i += 1
|
i += 1
|
||||||
}
|
}
|
||||||
graphWithMsgs.mapVertices(vertexWithMsgs => vertexWithMsgs.data match {
|
// Return the final graph
|
||||||
case (vData, _) => vData
|
g
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,6 @@
|
||||||
package org.apache.spark.graph.impl
|
package org.apache.spark.graph.impl
|
||||||
|
|
||||||
import scala.collection.mutable.ArrayBuilder
|
import scala.collection.mutable.ArrayBuilder
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntArrayList
|
|
||||||
|
|
||||||
import org.apache.spark.graph._
|
import org.apache.spark.graph._
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,28 +8,42 @@ import org.apache.spark.graph._
|
||||||
* A partition of edges in 3 large columnar arrays.
|
* A partition of edges in 3 large columnar arrays.
|
||||||
*/
|
*/
|
||||||
private[graph]
|
private[graph]
|
||||||
class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest] {
|
class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassManifest](
|
||||||
|
val srcIds: Array[Vid],
|
||||||
|
val dstIds: Array[Vid],
|
||||||
|
val data: Array[ED]
|
||||||
|
){
|
||||||
|
|
||||||
private var _data: Array[ED] = _
|
// private var _data: Array[ED] = _
|
||||||
private var _dataBuilder = ArrayBuilder.make[ED]
|
// private var _dataBuilder = ArrayBuilder.make[ED]
|
||||||
|
|
||||||
val srcIds = new VertexArrayList
|
// var srcIds = new VertexArrayList
|
||||||
val dstIds = new VertexArrayList
|
// var dstIds = new VertexArrayList
|
||||||
|
|
||||||
def data: Array[ED] = _data
|
def reverse: EdgePartition[ED] = new EdgePartition(dstIds, srcIds, data)
|
||||||
|
|
||||||
/** Add a new edge to the partition. */
|
def map[ED2: ClassManifest](f: Edge[ED] => ED2): EdgePartition[ED2] = {
|
||||||
def add(src: Vid, dst: Vid, d: ED) {
|
val newData = new Array[ED2](data.size)
|
||||||
srcIds.add(src)
|
val edge = new Edge[ED]()
|
||||||
dstIds.add(dst)
|
for(i <- 0 until data.size){
|
||||||
_dataBuilder += d
|
edge.src = srcIds(i)
|
||||||
|
edge.dst = dstIds(i)
|
||||||
|
edge.data = data(i)
|
||||||
|
newData(i) = f(edge)
|
||||||
|
}
|
||||||
|
new EdgePartition(srcIds, dstIds, newData)
|
||||||
}
|
}
|
||||||
|
|
||||||
def trim() {
|
def foreach(f: Edge[ED] => Unit) {
|
||||||
srcIds.trim()
|
val edge = new Edge[ED]
|
||||||
dstIds.trim()
|
for(i <- 0 until data.size){
|
||||||
_data = _dataBuilder.result()
|
edge.src = srcIds(i)
|
||||||
|
edge.dst = dstIds(i)
|
||||||
|
edge.data = data(i)
|
||||||
|
f(edge)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def size: Int = srcIds.size
|
def size: Int = srcIds.size
|
||||||
|
|
||||||
|
@ -43,11 +54,13 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
|
||||||
override def hasNext: Boolean = pos < EdgePartition.this.size
|
override def hasNext: Boolean = pos < EdgePartition.this.size
|
||||||
|
|
||||||
override def next(): Edge[ED] = {
|
override def next(): Edge[ED] = {
|
||||||
edge.src = srcIds.get(pos)
|
edge.src = srcIds(pos)
|
||||||
edge.dst = dstIds.get(pos)
|
edge.dst = dstIds(pos)
|
||||||
edge.data = _data(pos)
|
edge.data = data(pos)
|
||||||
pos += 1
|
pos += 1
|
||||||
edge
|
edge
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
package org.apache.spark.graph.impl
|
||||||
|
|
||||||
|
import scala.collection.mutable.ArrayBuilder
|
||||||
|
import org.apache.spark.graph._
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Accumulates edges one at a time and turns them into an [[EdgePartition]].
 * Source ids, destination ids and edge attributes are collected in three parallel buffers.
 */
private[graph]
class EdgePartitionBuilder[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
ED: ClassManifest] {
  val srcIds = new VertexArrayList
  val dstIds = new VertexArrayList
  var dataBuilder = ArrayBuilder.make[ED]

  /** Append one edge (source id, destination id, attribute) to the buffers. */
  def add(src: Vid, dst: Vid, d: ED): Unit = {
    srcIds.add(src)
    dstIds.add(dst)
    dataBuilder += d
  }

  /** Build an EdgePartition from the ids and attributes added so far. */
  def toEdgePartition: EdgePartition[ED] = {
    val sources = srcIds.toLongArray()
    val targets = dstIds.toLongArray()
    val attributes = dataBuilder.result()
    new EdgePartition(sources, targets, attributes)
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,112 +1,112 @@
|
||||||
package org.apache.spark.graph.impl
|
// package org.apache.spark.graph.impl
|
||||||
|
|
||||||
import scala.collection.mutable
|
// import scala.collection.mutable
|
||||||
|
|
||||||
import org.apache.spark.Aggregator
|
// import org.apache.spark.Aggregator
|
||||||
import org.apache.spark.Partition
|
// import org.apache.spark.Partition
|
||||||
import org.apache.spark.SparkEnv
|
// import org.apache.spark.SparkEnv
|
||||||
import org.apache.spark.TaskContext
|
// import org.apache.spark.TaskContext
|
||||||
import org.apache.spark.rdd.RDD
|
// import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.Dependency
|
// import org.apache.spark.Dependency
|
||||||
import org.apache.spark.OneToOneDependency
|
// import org.apache.spark.OneToOneDependency
|
||||||
import org.apache.spark.ShuffleDependency
|
// import org.apache.spark.ShuffleDependency
|
||||||
import org.apache.spark.SparkContext._
|
// import org.apache.spark.SparkContext._
|
||||||
import org.apache.spark.graph._
|
// import org.apache.spark.graph._
|
||||||
|
|
||||||
|
|
||||||
private[graph]
|
// private[graph]
|
||||||
class EdgeTripletPartition(idx: Int, val vPart: Partition, val ePart: Partition)
|
// class EdgeTripletPartition(idx: Int, val vPart: Partition, val ePart: Partition)
|
||||||
extends Partition {
|
// extends Partition {
|
||||||
override val index: Int = idx
|
// override val index: Int = idx
|
||||||
override def hashCode(): Int = idx
|
// override def hashCode(): Int = idx
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
|
||||||
/**
|
// /**
|
||||||
* A RDD that brings together edge data with its associated vertex data.
|
// * A RDD that brings together edge data with its associated vertex data.
|
||||||
*/
|
// */
|
||||||
private[graph]
|
// private[graph]
|
||||||
class EdgeTripletRDD[VD: ClassManifest, ED: ClassManifest](
|
// class EdgeTripletRDD[VD: ClassManifest, ED: ClassManifest](
|
||||||
vTableReplicated: RDD[(Vid, VD)],
|
// vTableReplicated: IndexedRDD[Pid, VertexHashMap[VD]],
|
||||||
eTable: RDD[(Pid, EdgePartition[ED])])
|
// eTable: IndexedRDD[Pid, EdgePartition[ED]])
|
||||||
extends RDD[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])](eTable.context, Nil) {
|
// extends RDD[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])](eTable.context, Nil) {
|
||||||
|
|
||||||
//println("ddshfkdfhds" + vTableReplicated.partitioner.get.numPartitions)
|
// //println("ddshfkdfhds" + vTableReplicated.partitioner.get.numPartitions)
|
||||||
//println("9757984589347598734549" + eTable.partitioner.get.numPartitions)
|
// //println("9757984589347598734549" + eTable.partitioner.get.numPartitions)
|
||||||
|
|
||||||
assert(vTableReplicated.partitioner == eTable.partitioner)
|
// assert(vTableReplicated.partitioner == eTable.partitioner)
|
||||||
|
|
||||||
override def getDependencies: List[Dependency[_]] = {
|
// override def getDependencies: List[Dependency[_]] = {
|
||||||
List(new OneToOneDependency(eTable), new OneToOneDependency(vTableReplicated))
|
// List(new OneToOneDependency(eTable), new OneToOneDependency(vTableReplicated))
|
||||||
}
|
// }
|
||||||
|
|
||||||
override def getPartitions = Array.tabulate[Partition](eTable.partitions.size) {
|
// override def getPartitions = Array.tabulate[Partition](eTable.partitions.size) {
|
||||||
i => new EdgeTripletPartition(i, eTable.partitions(i), vTableReplicated.partitions(i))
|
// i => new EdgeTripletPartition(i, eTable.partitions(i), vTableReplicated.partitions(i))
|
||||||
}
|
// }
|
||||||
|
|
||||||
override val partitioner = eTable.partitioner
|
// override val partitioner = eTable.partitioner
|
||||||
|
|
||||||
override def getPreferredLocations(s: Partition) =
|
// override def getPreferredLocations(s: Partition) =
|
||||||
eTable.preferredLocations(s.asInstanceOf[EdgeTripletPartition].ePart)
|
// eTable.preferredLocations(s.asInstanceOf[EdgeTripletPartition].ePart)
|
||||||
|
|
||||||
override def compute(s: Partition, context: TaskContext)
|
// override def compute(s: Partition, context: TaskContext)
|
||||||
: Iterator[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])] = {
|
// : Iterator[(VertexHashMap[VD], Iterator[EdgeTriplet[VD, ED]])] = {
|
||||||
|
|
||||||
val split = s.asInstanceOf[EdgeTripletPartition]
|
// val split = s.asInstanceOf[EdgeTripletPartition]
|
||||||
|
|
||||||
// Fetch the vertices and put them in a hashmap.
|
// // Fetch the vertices and put them in a hashmap.
|
||||||
// TODO: use primitive hashmaps for primitive VD types.
|
// // TODO: use primitive hashmaps for primitive VD types.
|
||||||
val vmap = new VertexHashMap[VD]//(1000000)
|
// val vmap = new VertexHashMap[VD]//(1000000)
|
||||||
vTableReplicated.iterator(split.vPart, context).foreach { v => vmap.put(v._1, v._2) }
|
// vTableReplicated.iterator(split.vPart, context).foreach { v => vmap.put(v._1, v._2) }
|
||||||
|
|
||||||
val (pid, edgePartition) = eTable.iterator(split.ePart, context).next()
|
// val (pid, edgePartition) = eTable.iterator(split.ePart, context).next()
|
||||||
.asInstanceOf[(Pid, EdgePartition[ED])]
|
// .asInstanceOf[(Pid, EdgePartition[ED])]
|
||||||
|
|
||||||
// Return an iterator that looks up the hash map to find matching vertices for each edge.
|
// // Return an iterator that looks up the hash map to find matching vertices for each edge.
|
||||||
val iter = new Iterator[EdgeTriplet[VD, ED]] {
|
// val iter = new Iterator[EdgeTriplet[VD, ED]] {
|
||||||
private var pos = 0
|
// private var pos = 0
|
||||||
private val e = new EdgeTriplet[VD, ED]
|
// private val e = new EdgeTriplet[VD, ED]
|
||||||
e.src = new Vertex[VD]
|
// e.src = new Vertex[VD]
|
||||||
e.dst = new Vertex[VD]
|
// e.dst = new Vertex[VD]
|
||||||
|
|
||||||
override def hasNext: Boolean = pos < edgePartition.size
|
// override def hasNext: Boolean = pos < edgePartition.size
|
||||||
override def next() = {
|
// override def next() = {
|
||||||
e.src.id = edgePartition.srcIds.getLong(pos)
|
// e.src.id = edgePartition.srcIds.getLong(pos)
|
||||||
// assert(vmap.containsKey(e.src.id))
|
// // assert(vmap.containsKey(e.src.id))
|
||||||
e.src.data = vmap.get(e.src.id)
|
// e.src.data = vmap.get(e.src.id)
|
||||||
|
|
||||||
e.dst.id = edgePartition.dstIds.getLong(pos)
|
// e.dst.id = edgePartition.dstIds.getLong(pos)
|
||||||
// assert(vmap.containsKey(e.dst.id))
|
// // assert(vmap.containsKey(e.dst.id))
|
||||||
e.dst.data = vmap.get(e.dst.id)
|
// e.dst.data = vmap.get(e.dst.id)
|
||||||
|
|
||||||
//println("Iter called: " + pos)
|
// //println("Iter called: " + pos)
|
||||||
e.data = edgePartition.data(pos)
|
// e.data = edgePartition.data(pos)
|
||||||
pos += 1
|
// pos += 1
|
||||||
e
|
// e
|
||||||
}
|
// }
|
||||||
|
|
||||||
override def toList: List[EdgeTriplet[VD, ED]] = {
|
// override def toList: List[EdgeTriplet[VD, ED]] = {
|
||||||
val lb = new mutable.ListBuffer[EdgeTriplet[VD,ED]]
|
// val lb = new mutable.ListBuffer[EdgeTriplet[VD,ED]]
|
||||||
for (i <- (0 until edgePartition.size)) {
|
// for (i <- (0 until edgePartition.size)) {
|
||||||
val currentEdge = new EdgeTriplet[VD, ED]
|
// val currentEdge = new EdgeTriplet[VD, ED]
|
||||||
currentEdge.src = new Vertex[VD]
|
// currentEdge.src = new Vertex[VD]
|
||||||
currentEdge.dst = new Vertex[VD]
|
// currentEdge.dst = new Vertex[VD]
|
||||||
currentEdge.src.id = edgePartition.srcIds.getLong(i)
|
// currentEdge.src.id = edgePartition.srcIds.getLong(i)
|
||||||
// assert(vmap.containsKey(e.src.id))
|
// // assert(vmap.containsKey(e.src.id))
|
||||||
currentEdge.src.data = vmap.get(currentEdge.src.id)
|
// currentEdge.src.data = vmap.get(currentEdge.src.id)
|
||||||
|
|
||||||
currentEdge.dst.id = edgePartition.dstIds.getLong(i)
|
// currentEdge.dst.id = edgePartition.dstIds.getLong(i)
|
||||||
// assert(vmap.containsKey(e.dst.id))
|
// // assert(vmap.containsKey(e.dst.id))
|
||||||
currentEdge.dst.data = vmap.get(currentEdge.dst.id)
|
// currentEdge.dst.data = vmap.get(currentEdge.dst.id)
|
||||||
|
|
||||||
currentEdge.data = edgePartition.data(i)
|
// currentEdge.data = edgePartition.data(i)
|
||||||
//println("Iter: " + pos + " " + e.src.id + " " + e.dst.id + " " + e.data)
|
// //println("Iter: " + pos + " " + e.src.id + " " + e.dst.id + " " + e.data)
|
||||||
//println("List: " + i + " " + currentEdge.src.id + " " + currentEdge.dst.id + " " + currentEdge.data)
|
// //println("List: " + i + " " + currentEdge.src.id + " " + currentEdge.dst.id + " " + currentEdge.data)
|
||||||
lb += currentEdge
|
// lb += currentEdge
|
||||||
}
|
// }
|
||||||
lb.toList
|
// lb.toList
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
Iterator((vmap, iter))
|
// Iterator((vmap, iter))
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|
|
@ -2,12 +2,18 @@ package org.apache.spark.graph.impl
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConversions._
|
||||||
|
|
||||||
|
import scala.collection.mutable
|
||||||
|
|
||||||
import org.apache.spark.SparkContext._
|
import org.apache.spark.SparkContext._
|
||||||
import org.apache.spark.Partitioner
|
import org.apache.spark.Partitioner
|
||||||
import org.apache.spark.HashPartitioner
|
import org.apache.spark.HashPartitioner
|
||||||
import org.apache.spark.util.ClosureCleaner
|
import org.apache.spark.util.ClosureCleaner
|
||||||
|
|
||||||
|
import org.apache.spark.rdd
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.rdd.IndexedRDD
|
||||||
|
import org.apache.spark.rdd.RDDIndex
|
||||||
|
|
||||||
|
|
||||||
import org.apache.spark.graph._
|
import org.apache.spark.graph._
|
||||||
import org.apache.spark.graph.impl.GraphImpl._
|
import org.apache.spark.graph.impl.GraphImpl._
|
||||||
|
@ -18,112 +24,224 @@ import org.apache.spark.graph.impl.MessageToPartitionRDDFunctions._
|
||||||
* A Graph RDD that supports computation on graphs.
|
* A Graph RDD that supports computation on graphs.
|
||||||
*/
|
*/
|
||||||
class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
val numVertexPartitions: Int,
|
val vTable: IndexedRDD[Vid, VD],
|
||||||
val numEdgePartitions: Int,
|
val vid2pid: IndexedRDD[Vid, Pid],
|
||||||
_rawVertices: RDD[Vertex[VD]],
|
val eTable: IndexedRDD[Pid, EdgePartition[ED]])
|
||||||
_rawEdges: RDD[Edge[ED]],
|
|
||||||
_rawVTable: RDD[(Vid, (VD, Array[Pid]))],
|
|
||||||
_rawETable: RDD[(Pid, EdgePartition[ED])])
|
|
||||||
extends Graph[VD, ED] {
|
extends Graph[VD, ED] {
|
||||||
|
|
||||||
def this(vertices: RDD[Vertex[VD]], edges: RDD[Edge[ED]]) = {
|
|
||||||
this(vertices.partitions.size, edges.partitions.size, vertices, edges, null, null)
|
/**
|
||||||
|
* The vTableReplicated is a version of the vertex data after it is
|
||||||
|
* replicated.
|
||||||
|
*/
|
||||||
|
val vTableReplicated: IndexedRDD[Pid, VertexHashMap[VD]] = {
|
||||||
|
// Join vid2pid and vTable, generate a shuffle dependency on the joined
|
||||||
|
// result, and get the shuffle id so we can use it on the slave.
|
||||||
|
vTable.cogroup(vid2pid)
|
||||||
|
.flatMap { case (vid, (vdatas, pids)) =>
|
||||||
|
pids.iterator.map {
|
||||||
|
pid => MessageToPartition(pid, (vid, vdatas.head))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.partitionBy(eTable.partitioner.get) //@todo assert edge table has partitioner
|
||||||
|
.mapPartitionsWithIndex( (pid, iter) => {
|
||||||
|
// Build the hashmap for each partition
|
||||||
|
val vmap = new VertexHashMap[VD]
|
||||||
|
for( msg <- iter ) { vmap.put(msg.data._1, msg.data._2) }
|
||||||
|
Array((pid, vmap)).iterator
|
||||||
|
}, preservesPartitioning = true)
|
||||||
|
.indexed(eTable.index)
|
||||||
}
|
}
|
||||||
|
|
||||||
def withPartitioner(numVertexPartitions: Int, numEdgePartitions: Int): Graph[VD, ED] = {
|
|
||||||
if (_cached) {
|
|
||||||
new GraphImpl(numVertexPartitions, numEdgePartitions, null, null, _rawVTable, _rawETable)
|
|
||||||
.cache()
|
|
||||||
} else {
|
|
||||||
new GraphImpl(numVertexPartitions, numEdgePartitions, _rawVertices, _rawEdges, null, null)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def withVertexPartitioner(numVertexPartitions: Int) = {
|
|
||||||
withPartitioner(numVertexPartitions, numEdgePartitions)
|
|
||||||
}
|
|
||||||
|
|
||||||
def withEdgePartitioner(numEdgePartitions: Int) = {
|
|
||||||
withPartitioner(numVertexPartitions, numEdgePartitions)
|
|
||||||
}
|
|
||||||
|
|
||||||
protected var _cached = false
|
|
||||||
|
|
||||||
|
// def this(vertices: RDD[Vertex[VD]], edges: RDD[Edge[ED]]) = {
|
||||||
|
// this(vertices.partitions.size, edges.partitions.size, vertices, edges, null, null)
|
||||||
|
// }
|
||||||
|
|
||||||
|
// def withPartitioner(numVertexPartitions: Int, numEdgePartitions: Int): Graph[VD, ED] = {
|
||||||
|
// if (_cached) {
|
||||||
|
// new GraphImpl(numVertexPartitions, numEdgePartitions, null, null, _rawVTable, _rawETable)
|
||||||
|
// .cache()
|
||||||
|
// } else {
|
||||||
|
// new GraphImpl(numVertexPartitions, numEdgePartitions, _rawVertices, _rawEdges, null, null)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// def withVertexPartitioner(numVertexPartitions: Int) = {
|
||||||
|
// withPartitioner(numVertexPartitions, numEdgePartitions)
|
||||||
|
// }
|
||||||
|
|
||||||
|
// def withEdgePartitioner(numEdgePartitions: Int) = {
|
||||||
|
// withPartitioner(numVertexPartitions, numEdgePartitions)
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
override def cache(): Graph[VD, ED] = {
|
override def cache(): Graph[VD, ED] = {
|
||||||
eTable.cache()
|
eTable.cache()
|
||||||
|
vid2pid.cache()
|
||||||
vTable.cache()
|
vTable.cache()
|
||||||
_cached = true
|
// @todo: should we cache the replicated data?
|
||||||
|
vTableReplicated.cache()
|
||||||
this
|
this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
override def replication(): Double = {
|
override def replication(): Double = {
|
||||||
val rep = vTable.map{ case (_, (_, a)) => a.size }.sum
|
val rep = vid2pid.groupByKey().map(kv => kv._2.size).sum
|
||||||
rep / vTable.count
|
rep / vTable.count
|
||||||
}
|
}
|
||||||
|
|
||||||
override def balance(): Array[Int] = {
|
override def balance(): Array[Int] = {
|
||||||
eTable.map{ case (_, epart) => epart.data.size }.collect
|
eTable.map{ case (pid, epart) => epart.data.size }.collect
|
||||||
}
|
}
|
||||||
|
|
||||||
override def reverse: Graph[VD, ED] = {
|
override def reverse: Graph[VD, ED] = {
|
||||||
newGraph(vertices, edges.map{ case Edge(s, t, e) => Edge(t, s, e) })
|
val etable = eTable.mapValues( _.reverse ).asInstanceOf[IndexedRDD[Pid, EdgePartition[ED]]]
|
||||||
|
new GraphImpl(vTable, vid2pid, etable)
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return a RDD of vertices. */
|
/** Return a RDD of vertices. */
|
||||||
override def vertices: RDD[Vertex[VD]] = {
|
override def vertices: RDD[(Vid, VD)] = vTable
|
||||||
if (!_cached && _rawVertices != null) {
|
|
||||||
_rawVertices
|
|
||||||
} else {
|
|
||||||
vTable.map { case(vid, (data, pids)) => new Vertex(vid, data) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Return a RDD of edges. */
|
/** Return a RDD of edges. */
|
||||||
override def edges: RDD[Edge[ED]] = {
|
override def edges: RDD[Edge[ED]] = {
|
||||||
if (!_cached && _rawEdges != null) {
|
|
||||||
_rawEdges
|
|
||||||
} else {
|
|
||||||
eTable.mapPartitions { iter => iter.next()._2.iterator }
|
eTable.mapPartitions { iter => iter.next()._2.iterator }
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/** Return a RDD that brings edges with its source and destination vertices together. */
|
/** Return a RDD that brings edges with its source and destination vertices together. */
|
||||||
override def triplets: RDD[EdgeTriplet[VD, ED]] = {
|
override def triplets: RDD[EdgeTriplet[VD, ED]] = {
|
||||||
new EdgeTripletRDD(vTableReplicated, eTable).mapPartitions { part => part.next()._2 }
|
vTableReplicated.join(eTable)
|
||||||
|
.mapPartitions{ iter =>
|
||||||
|
val (pid, (vmap, edgePartition)) = iter.next()
|
||||||
|
assert(iter.hasNext == false)
|
||||||
|
// Return an iterator that looks up the hash map to find matching
|
||||||
|
// vertices for each edge.
|
||||||
|
new Iterator[EdgeTriplet[VD, ED]] {
|
||||||
|
private var pos = 0
|
||||||
|
private val e = new EdgeTriplet[VD, ED]
|
||||||
|
e.src = new Vertex[VD]
|
||||||
|
e.dst = new Vertex[VD]
|
||||||
|
|
||||||
|
override def hasNext: Boolean = pos < edgePartition.size
|
||||||
|
override def next() = {
|
||||||
|
e.src.id = edgePartition.srcIds(pos)
|
||||||
|
// assert(vmap.containsKey(e.src.id))
|
||||||
|
e.src.data = vmap.get(e.src.id)
|
||||||
|
e.dst.id = edgePartition.dstIds(pos)
|
||||||
|
// assert(vmap.containsKey(e.dst.id))
|
||||||
|
e.dst.data = vmap.get(e.dst.id)
|
||||||
|
//println("Iter called: " + pos)
|
||||||
|
e.data = edgePartition.data(pos)
|
||||||
|
pos += 1
|
||||||
|
e
|
||||||
}
|
}
|
||||||
|
|
||||||
override def mapVertices[VD2: ClassManifest](f: Vertex[VD] => VD2): Graph[VD2, ED] = {
|
override def toList: List[EdgeTriplet[VD, ED]] = {
|
||||||
newGraph(vertices.map(v => Vertex(v.id, f(v))), edges)
|
val lb = new mutable.ListBuffer[EdgeTriplet[VD,ED]]
|
||||||
|
for (i <- (0 until edgePartition.size)) {
|
||||||
|
val currentEdge = new EdgeTriplet[VD, ED]
|
||||||
|
currentEdge.src = new Vertex[VD]
|
||||||
|
currentEdge.dst = new Vertex[VD]
|
||||||
|
currentEdge.src.id = edgePartition.srcIds(i)
|
||||||
|
// assert(vmap.containsKey(e.src.id))
|
||||||
|
currentEdge.src.data = vmap.get(currentEdge.src.id)
|
||||||
|
|
||||||
|
currentEdge.dst.id = edgePartition.dstIds(i)
|
||||||
|
// assert(vmap.containsKey(e.dst.id))
|
||||||
|
currentEdge.dst.data = vmap.get(currentEdge.dst.id)
|
||||||
|
|
||||||
|
currentEdge.data = edgePartition.data(i)
|
||||||
|
lb += currentEdge
|
||||||
|
}
|
||||||
|
lb.toList
|
||||||
|
}
|
||||||
|
} // end of iterator
|
||||||
|
} // end of map partition
|
||||||
|
}
|
||||||
|
|
||||||
|
override def mapVertices[VD2: ClassManifest](f: (Vid, VD) => VD2): Graph[VD2, ED] = {
|
||||||
|
val newVTable = vTable.mapValuesWithKeys((vid, data) => f(vid, data))
|
||||||
|
.asInstanceOf[IndexedRDD[Vid, VD2]]
|
||||||
|
new GraphImpl(newVTable, vid2pid, eTable)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def mapEdges[ED2: ClassManifest](f: Edge[ED] => ED2): Graph[VD, ED2] = {
|
override def mapEdges[ED2: ClassManifest](f: Edge[ED] => ED2): Graph[VD, ED2] = {
|
||||||
newGraph(vertices, edges.map(e => Edge(e.src, e.dst, f(e))))
|
val newETable = eTable.mapValues(eBlock => eBlock.map(f))
|
||||||
|
.asInstanceOf[IndexedRDD[Pid, EdgePartition[ED2]]]
|
||||||
|
new GraphImpl(vTable, vid2pid, newETable)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2):
|
override def mapTriplets[ED2: ClassManifest](f: EdgeTriplet[VD, ED] => ED2):
|
||||||
Graph[VD, ED2] = {
|
Graph[VD, ED2] = {
|
||||||
newGraph(vertices, triplets.map(e => Edge(e.src.id, e.dst.id, f(e))))
|
val newETable = eTable.join(vTableReplicated).mapValues{
|
||||||
|
case (edgePartition, vmap) =>
|
||||||
|
val et = new EdgeTriplet[VD, ED]
|
||||||
|
et.src = new Vertex[VD]
|
||||||
|
et.dst = new Vertex[VD]
|
||||||
|
|
||||||
|
edgePartition.map{e =>
|
||||||
|
et.data = e.data
|
||||||
|
et.src.id = e.src
|
||||||
|
et.src.data = vmap(e.src)
|
||||||
|
et.dst.id = e.dst
|
||||||
|
et.dst.data = vmap(e.dst)
|
||||||
|
f(et)
|
||||||
|
}
|
||||||
|
}.asInstanceOf[IndexedRDD[Pid, EdgePartition[ED2]]]
|
||||||
|
new GraphImpl(vTable, vid2pid, newETable)
|
||||||
}
|
}
|
||||||
|
|
||||||
override def correctEdges(): Graph[VD, ED] = {
|
// override def correctEdges(): Graph[VD, ED] = {
|
||||||
val sc = vertices.context
|
// val sc = vertices.context
|
||||||
val vset = sc.broadcast(vertices.map(_.id).collect().toSet)
|
// val vset = sc.broadcast(vertices.map(_.id).collect().toSet)
|
||||||
val newEdges = edges.filter(e => vset.value.contains(e.src) && vset.value.contains(e.dst))
|
// val newEdges = edges.filter(e => vset.value.contains(e.src) && vset.value.contains(e.dst))
|
||||||
Graph(vertices, newEdges)
|
// Graph(vertices, newEdges)
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
|
||||||
override def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (_ => true),
|
override def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (x => true),
|
||||||
vpred: Vertex[VD] => Boolean = (_ => true) ): Graph[VD, ED] = {
|
vpred: (Vid, VD) => Boolean = ((a,b) => true) ): Graph[VD, ED] = {
|
||||||
|
|
||||||
|
/// @todo: The following code behaves deterministically on each
|
||||||
|
/// vertex predicate but uses additional space. Should we swithc to
|
||||||
|
/// this version
|
||||||
|
// val predGraph = mapVertices(v => (v.data, vpred(v)))
|
||||||
|
// val newETable = predGraph.triplets.filter(t =>
|
||||||
|
// if(v.src.data._2 && v.dst.data._2) {
|
||||||
|
// val src = Vertex(t.src.id, t.src.data._1)
|
||||||
|
// val dst = Vertex(t.dst.id, t.dst.data._1)
|
||||||
|
// epred(new EdgeTriplet[VD, ED](src, dst, t.data))
|
||||||
|
// } else { false })
|
||||||
|
|
||||||
|
// val newVTable = predGraph.vertices.filter(v => v.data._1)
|
||||||
|
// .map(v => (v.id, v.data._1)).indexed()
|
||||||
|
|
||||||
|
// Reuse the partitioner (but not the index) from this graph
|
||||||
|
val newVTable = vertices.filter(v => vpred(v._1, v._2)).indexed(vTable.index.partitioner)
|
||||||
|
|
||||||
|
|
||||||
// Restrict the set of vertices to those that satisfy the vertex predicate
|
|
||||||
val newVertices = vertices.filter(vpred)
|
|
||||||
// Restrict the set of edges to those that satisfy the vertex and the edge predicate.
|
// Restrict the set of edges to those that satisfy the vertex and the edge predicate.
|
||||||
val newEdges = triplets.filter(t => vpred(t.src) && vpred(t.dst) && epred(t))
|
val newETable = createETable(
|
||||||
.map( t => Edge(t.src.id, t.dst.id, t.data) )
|
triplets.filter(
|
||||||
|
t => vpred( t.src.id, t.src.data ) && vpred( t.dst.id, t.dst.data ) && epred(t)
|
||||||
|
)
|
||||||
|
.map( t => Edge(t.src.id, t.dst.id, t.data) ),
|
||||||
|
eTable.index.partitioner.numPartitions
|
||||||
|
)
|
||||||
|
|
||||||
new GraphImpl(newVertices, newEdges)
|
// Construct the Vid2Pid map. Here we assume that the filter operation
|
||||||
|
// behaves deterministically.
|
||||||
|
// @todo reindex the vertex and edge tables
|
||||||
|
val newVid2Pid = createVid2Pid(newETable, newVTable.index)
|
||||||
|
|
||||||
|
new GraphImpl(newVTable, newVid2Pid, newETable)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -135,9 +253,9 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
// TODO(crankshaw) is there a better way to do this using RDD.groupBy()
|
// TODO(crankshaw) is there a better way to do this using RDD.groupBy()
|
||||||
// functions?
|
// functions?
|
||||||
|
|
||||||
override def groupEdgeTriplets[ED2: ClassManifest](f: Iterator[EdgeTriplet[VD,ED]] => ED2 ):
|
override def groupEdgeTriplets[ED2: ClassManifest](
|
||||||
|
f: Iterator[EdgeTriplet[VD,ED]] => ED2 ): Graph[VD,ED2] = {
|
||||||
//override def groupEdges[ED2: ClassManifest](f: Iterator[Edge[ED]] => ED2 ):
|
//override def groupEdges[ED2: ClassManifest](f: Iterator[Edge[ED]] => ED2 ):
|
||||||
Graph[VD,ED2] = {
|
|
||||||
|
|
||||||
// I think that
|
// I think that
|
||||||
// myRDD.mapPartitions { part =>
|
// myRDD.mapPartitions { part =>
|
||||||
|
@ -169,7 +287,8 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
.mapValues { ts: List[EdgeTriplet[VD, ED]] => f(ts.toIterator) }
|
.mapValues { ts: List[EdgeTriplet[VD, ED]] => f(ts.toIterator) }
|
||||||
// convert the resulting map back to a list of tuples
|
// convert the resulting map back to a list of tuples
|
||||||
.toList
|
.toList
|
||||||
// TODO(crankshaw) needs an iterator over the tuples? Why can't I map over the list?
|
// TODO(crankshaw) needs an iterator over the tuples?
|
||||||
|
// Why can't I map over the list?
|
||||||
.toIterator
|
.toIterator
|
||||||
// map over those tuples that contain src and dst info plus the
|
// map over those tuples that contain src and dst info plus the
|
||||||
// new edge data to make my new edges
|
// new edge data to make my new edges
|
||||||
|
@ -185,7 +304,12 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
// and http://stackoverflow.com/questions/6998676/converting-a-scala-map-to-a-list
|
// and http://stackoverflow.com/questions/6998676/converting-a-scala-map-to-a-list
|
||||||
|
|
||||||
}
|
}
|
||||||
newGraph(vertices, newEdges)
|
|
||||||
|
// @todo eliminate the need to call createETable
|
||||||
|
val newETable = createETable(newEdges,
|
||||||
|
eTable.index.partitioner.numPartitions)
|
||||||
|
|
||||||
|
new GraphImpl(vTable, vid2pid, newETable)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -202,11 +326,12 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
.toList
|
.toList
|
||||||
.toIterator
|
.toIterator
|
||||||
.map { case ((src, dst), data) => Edge(src, dst, data) }
|
.map { case ((src, dst), data) => Edge(src, dst, data) }
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
newGraph(vertices, newEdges)
|
// @todo eliminate the need to call createETable
|
||||||
|
val newETable = createETable(newEdges,
|
||||||
|
eTable.index.partitioner.numPartitions)
|
||||||
|
|
||||||
|
new GraphImpl(vTable, vid2pid, newETable)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,156 +340,90 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
// Lower level transformation methods
|
// Lower level transformation methods
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
override def aggregateNeighbors[A: ClassManifest](
|
override def mapReduceTriplets[A: ClassManifest](
|
||||||
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
mapFunc: EdgeTriplet[VD, ED] => Array[(Vid, A)],
|
||||||
reduceFunc: (A, A) => A,
|
reduceFunc: (A, A) => A)
|
||||||
default: A,
|
: RDD[(Vid, A)] = {
|
||||||
gatherDirection: EdgeDirection)
|
|
||||||
: Graph[(VD, Option[A]), ED] = {
|
|
||||||
|
|
||||||
ClosureCleaner.clean(mapFunc)
|
ClosureCleaner.clean(mapFunc)
|
||||||
ClosureCleaner.clean(reduceFunc)
|
ClosureCleaner.clean(reduceFunc)
|
||||||
|
|
||||||
val newVTable = vTableReplicated.mapPartitions({ part =>
|
val newVTable: RDD[(Vid, A)] =
|
||||||
part.map { v => (v._1, MutableTuple2(v._2, Option.empty[A])) }
|
vTableReplicated.join(eTable).flatMap{
|
||||||
}, preservesPartitioning = true)
|
case (pid, (vmap, edgePartition)) =>
|
||||||
|
val aggMap = new VertexHashMap[A]
|
||||||
|
val et = new EdgeTriplet[VD, ED]
|
||||||
|
et.src = new Vertex[VD]
|
||||||
|
et.dst = new Vertex[VD]
|
||||||
|
edgePartition.foreach{e =>
|
||||||
|
et.data = e.data
|
||||||
|
et.src.id = e.src
|
||||||
|
et.src.data = vmap(e.src)
|
||||||
|
et.dst.id = e.dst
|
||||||
|
et.dst.data = vmap(e.dst)
|
||||||
|
mapFunc(et).foreach{case (vid, a) =>
|
||||||
|
if(aggMap.containsKey(vid)) {
|
||||||
|
aggMap.put(vid, reduceFunc(aggMap.get(vid), a))
|
||||||
|
} else { aggMap.put(vid, a) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Return the aggregate map
|
||||||
|
aggMap.long2ObjectEntrySet().fastIterator().map{
|
||||||
|
entry => (entry.getLongKey(), entry.getValue())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.indexed(vTable.index).reduceByKey(reduceFunc)
|
||||||
|
|
||||||
val newVertices: RDD[(Vid, A)] =
|
newVTable
|
||||||
new EdgeTripletRDD[MutableTuple2[VD, Option[A]], ED](newVTable, eTable)
|
|
||||||
.mapPartitions { part =>
|
|
||||||
val (vmap, edges) = part.next()
|
|
||||||
val edgeSansAcc = new EdgeTriplet[VD, ED]()
|
|
||||||
edgeSansAcc.src = new Vertex[VD]
|
|
||||||
edgeSansAcc.dst = new Vertex[VD]
|
|
||||||
edges.foreach { e: EdgeTriplet[MutableTuple2[VD, Option[A]], ED] =>
|
|
||||||
edgeSansAcc.data = e.data
|
|
||||||
edgeSansAcc.src.data = e.src.data._1
|
|
||||||
edgeSansAcc.dst.data = e.dst.data._1
|
|
||||||
edgeSansAcc.src.id = e.src.id
|
|
||||||
edgeSansAcc.dst.id = e.dst.id
|
|
||||||
if (gatherDirection == EdgeDirection.In || gatherDirection == EdgeDirection.Both) {
|
|
||||||
e.dst.data._2 =
|
|
||||||
if (e.dst.data._2.isEmpty) {
|
|
||||||
mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
|
|
||||||
} else {
|
|
||||||
val tmp = mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
|
|
||||||
if (!tmp.isEmpty) Some(reduceFunc(e.dst.data._2.get, tmp.get)) else e.dst.data._2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (gatherDirection == EdgeDirection.Out || gatherDirection == EdgeDirection.Both) {
|
|
||||||
e.dst.data._2 =
|
|
||||||
if (e.dst.data._2.isEmpty) {
|
|
||||||
mapFunc(edgeSansAcc.src.id, edgeSansAcc)
|
|
||||||
} else {
|
|
||||||
val tmp = mapFunc(edgeSansAcc.src.id, edgeSansAcc)
|
|
||||||
if (!tmp.isEmpty) Some(reduceFunc(e.src.data._2.get, tmp.get)) else e.src.data._2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
vmap.long2ObjectEntrySet().fastIterator().filter(!_.getValue()._2.isEmpty).map{ entry =>
|
|
||||||
(entry.getLongKey(), entry.getValue()._2)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.map{ case (vid, aOpt) => (vid, aOpt.get) }
|
|
||||||
.combineByKey((v: A) => v, reduceFunc, null, vertexPartitioner, false)
|
|
||||||
|
|
||||||
this.leftJoinVertices(newVertices, (v: Vertex[VD], a: Option[A]) => (v.data, a))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
def aggregateNeighbors[A: ClassManifest](
|
||||||
* Same as aggregateNeighbors but map function can return none and there is no default value.
|
|
||||||
* As a consequence, the resulting table may be much smaller than the set of vertices.
|
|
||||||
*/
|
|
||||||
override def aggregateNeighbors[A: ClassManifest](
|
|
||||||
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
mapFunc: (Vid, EdgeTriplet[VD, ED]) => Option[A],
|
||||||
reduceFunc: (A, A) => A,
|
reduceFunc: (A, A) => A,
|
||||||
gatherDirection: EdgeDirection): Graph[(VD, Option[A]), ED] = {
|
dir: EdgeDirection)
|
||||||
|
: RDD[(Vid, A)] = {
|
||||||
|
|
||||||
ClosureCleaner.clean(mapFunc)
|
ClosureCleaner.clean(mapFunc)
|
||||||
ClosureCleaner.clean(reduceFunc)
|
ClosureCleaner.clean(reduceFunc)
|
||||||
|
|
||||||
val newVTable = vTableReplicated.mapPartitions({ part =>
|
// Define a new map function over edge triplets
|
||||||
part.map { v => (v._1, MutableTuple2(v._2, Option.empty[A])) }
|
def mf(et: EdgeTriplet[VD,ED]): Array[(Vid, A)] = {
|
||||||
}, preservesPartitioning = true)
|
// Compute the message to the dst vertex
|
||||||
|
val dstA =
|
||||||
val newVertices: RDD[(Vid, A)] =
|
if (dir == EdgeDirection.In || dir == EdgeDirection.Both) {
|
||||||
new EdgeTripletRDD[MutableTuple2[VD, Option[A]], ED](newVTable, eTable)
|
mapFunc(et.dst.id, et)
|
||||||
.mapPartitions { part =>
|
} else { Option.empty[A] }
|
||||||
val (vmap, edges) = part.next()
|
// Compute the message to the source vertex
|
||||||
val edgeSansAcc = new EdgeTriplet[VD, ED]()
|
val srcA =
|
||||||
edgeSansAcc.src = new Vertex[VD]
|
if (dir == EdgeDirection.Out || dir == EdgeDirection.Both) {
|
||||||
edgeSansAcc.dst = new Vertex[VD]
|
mapFunc(et.src.id, et)
|
||||||
edges.foreach { e: EdgeTriplet[MutableTuple2[VD, Option[A]], ED] =>
|
} else { Option.empty[A] }
|
||||||
edgeSansAcc.data = e.data
|
// construct the return array
|
||||||
edgeSansAcc.src.data = e.src.data._1
|
(srcA, dstA) match {
|
||||||
edgeSansAcc.dst.data = e.dst.data._1
|
case (None, None) => Array.empty[(Vid, A)]
|
||||||
edgeSansAcc.src.id = e.src.id
|
case (Some(src),None) => Array((et.src.id, src))
|
||||||
edgeSansAcc.dst.id = e.dst.id
|
case (None, Some(dst)) => Array((et.dst.id, dst))
|
||||||
if (gatherDirection == EdgeDirection.In || gatherDirection == EdgeDirection.Both) {
|
case (Some(src), Some(dst)) =>
|
||||||
e.dst.data._2 =
|
Array((et.src.id, src), (et.dst.id, dst))
|
||||||
if (e.dst.data._2.isEmpty) {
|
|
||||||
mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
|
|
||||||
} else {
|
|
||||||
val tmp = mapFunc(edgeSansAcc.dst.id, edgeSansAcc)
|
|
||||||
if (!tmp.isEmpty) Some(reduceFunc(e.dst.data._2.get, tmp.get)) else e.dst.data._2
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (gatherDirection == EdgeDirection.Out || gatherDirection == EdgeDirection.Both) {
|
|
||||||
e.src.data._2 =
|
|
||||||
if (e.src.data._2.isEmpty) {
|
|
||||||
mapFunc(edgeSansAcc.src.id, edgeSansAcc)
|
|
||||||
} else {
|
|
||||||
val tmp = mapFunc(edgeSansAcc.src.id, edgeSansAcc)
|
|
||||||
if (!tmp.isEmpty) Some(reduceFunc(e.src.data._2.get, tmp.get)) else e.src.data._2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
vmap.long2ObjectEntrySet().fastIterator().filter(!_.getValue()._2.isEmpty).map{ entry =>
|
|
||||||
(entry.getLongKey(), entry.getValue()._2)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
.map{ case (vid, aOpt) => (vid, aOpt.get) }
|
|
||||||
.combineByKey((v: A) => v, reduceFunc, null, vertexPartitioner, false)
|
|
||||||
|
|
||||||
this.leftJoinVertices(newVertices, (v: Vertex[VD], a: Option[A]) => (v.data, a))
|
|
||||||
}
|
|
||||||
|
|
||||||
override def leftJoinVertices[U: ClassManifest, VD2: ClassManifest](
|
mapReduceTriplets(mf, reduceFunc)
|
||||||
updates: RDD[(Vid, U)],
|
}
|
||||||
updateF: (Vertex[VD], Option[U]) => VD2)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
override def outerJoinVertices[U: ClassManifest, VD2: ClassManifest]
|
||||||
|
(updates: RDD[(Vid, U)])(updateF: (Vid, VD, Option[U]) => VD2)
|
||||||
: Graph[VD2, ED] = {
|
: Graph[VD2, ED] = {
|
||||||
|
|
||||||
ClosureCleaner.clean(updateF)
|
ClosureCleaner.clean(updateF)
|
||||||
|
|
||||||
val newVTable = vTable.leftOuterJoin(updates).mapPartitions({ iter =>
|
val newVTable = vTable.leftOuterJoin(updates).mapValuesWithKeys{
|
||||||
iter.map { case (vid, ((vdata, pids), update)) =>
|
case (vid, (data, other)) => updateF(vid, data, other)
|
||||||
val newVdata = updateF(Vertex(vid, vdata), update)
|
}.asInstanceOf[IndexedRDD[Vid,VD2]]
|
||||||
(vid, (newVdata, pids))
|
new GraphImpl(newVTable, vid2pid, eTable)
|
||||||
}
|
|
||||||
}, preservesPartitioning = true).cache()
|
|
||||||
|
|
||||||
new GraphImpl(newVTable.partitions.length, eTable.partitions.length, null, null, newVTable, eTable)
|
|
||||||
}
|
|
||||||
|
|
||||||
override def joinVertices[U: ClassManifest](
|
|
||||||
updates: RDD[(Vid, U)],
|
|
||||||
updateF: (Vertex[VD], U) => VD)
|
|
||||||
: Graph[VD, ED] = {
|
|
||||||
|
|
||||||
ClosureCleaner.clean(updateF)
|
|
||||||
|
|
||||||
val newVTable = vTable.leftOuterJoin(updates).mapPartitions({ iter =>
|
|
||||||
iter.map { case (vid, ((vdata, pids), update)) =>
|
|
||||||
if (update.isDefined) {
|
|
||||||
val newVdata = updateF(Vertex(vid, vdata), update.get)
|
|
||||||
(vid, (newVdata, pids))
|
|
||||||
} else {
|
|
||||||
(vid, (vdata, pids))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, preservesPartitioning = true).cache()
|
|
||||||
|
|
||||||
new GraphImpl(newVTable.partitions.length, eTable.partitions.length, null, null, newVTable, eTable)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -372,49 +431,130 @@ class GraphImpl[VD: ClassManifest, ED: ClassManifest] protected (
|
||||||
// Internals hidden from callers
|
// Internals hidden from callers
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// TODO: Support non-hash partitioning schemes.
|
|
||||||
protected val vertexPartitioner = new HashPartitioner(numVertexPartitions)
|
|
||||||
protected val edgePartitioner = new HashPartitioner(numEdgePartitions)
|
|
||||||
|
|
||||||
/** Create a new graph but keep the current partitioning scheme. */
|
|
||||||
protected def newGraph[VD2: ClassManifest, ED2: ClassManifest](
|
|
||||||
vertices: RDD[Vertex[VD2]], edges: RDD[Edge[ED2]]): Graph[VD2, ED2] = {
|
|
||||||
(new GraphImpl[VD2, ED2](vertices, edges)).withPartitioner(numVertexPartitions, numEdgePartitions)
|
|
||||||
}
|
|
||||||
|
|
||||||
protected lazy val eTable: RDD[(Pid, EdgePartition[ED])] = {
|
|
||||||
if (_rawETable == null) {
|
|
||||||
createETable(_rawEdges, numEdgePartitions)
|
|
||||||
} else {
|
|
||||||
_rawETable
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected lazy val vTable: RDD[(Vid, (VD, Array[Pid]))] = {
|
|
||||||
if (_rawVTable == null) {
|
|
||||||
createVTable(_rawVertices, eTable, numVertexPartitions)
|
|
||||||
} else {
|
|
||||||
_rawVTable
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected lazy val vTableReplicated: RDD[(Vid, VD)] = {
|
// /** Create a new graph but keep the current partitioning scheme. */
|
||||||
// Join vid2pid and vTable, generate a shuffle dependency on the joined result, and get
|
// protected def newGraph[VD2: ClassManifest, ED2: ClassManifest](
|
||||||
// the shuffle id so we can use it on the slave.
|
// vertices: RDD[Vertex[VD2]], edges: RDD[Edge[ED2]]): Graph[VD2, ED2] = {
|
||||||
vTable
|
// (new GraphImpl[VD2, ED2](vertices, edges)).withPartitioner(numVertexPartitions, numEdgePartitions)
|
||||||
.flatMap { case (vid, (vdata, pids)) =>
|
// }
|
||||||
pids.iterator.map { pid => MessageToPartition(pid, (vid, vdata)) }
|
|
||||||
}
|
// protected lazy val eTable: RDD[(Pid, EdgePartition[ED])] = {
|
||||||
.partitionBy(edgePartitioner)
|
// if (_rawETable == null) {
|
||||||
.mapPartitions({ part =>
|
// createETable(_rawEdges, numEdgePartitions)
|
||||||
part.map { message => (message.data._1, message.data._2) }
|
// } else {
|
||||||
}, preservesPartitioning = true)
|
// _rawETable
|
||||||
}
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// protected lazy val vTable: RDD[(Vid, (VD, Array[Pid]))] = {
|
||||||
|
// if (_rawVTable == null) {
|
||||||
|
// createVTable(_rawVertices, eTable, numVertexPartitions)
|
||||||
|
// } else {
|
||||||
|
// _rawVTable
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// protected lazy val vTableReplicated: RDD[(Vid, VD)] = {
|
||||||
|
// // Join vid2pid and vTable, generate a shuffle dependency on the joined result, and get
|
||||||
|
// // the shuffle id so we can use it on the slave.
|
||||||
|
// vTable
|
||||||
|
// .flatMap { case (vid, (vdata, pids)) =>
|
||||||
|
// pids.iterator.map { pid => MessageToPartition(pid, (vid, vdata)) }
|
||||||
|
// }
|
||||||
|
// .partitionBy(edgePartitioner)
|
||||||
|
// .mapPartitions({ part =>
|
||||||
|
// part.map { message => (message.data._1, message.data._2) }
|
||||||
|
// }, preservesPartitioning = true)
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
object GraphImpl {
|
object GraphImpl {
|
||||||
|
|
||||||
|
def apply[VD: ClassManifest, ED: ClassManifest](
|
||||||
|
vertices: RDD[(Vid, VD)], edges: RDD[Edge[ED]]):
|
||||||
|
GraphImpl[VD,ED] = {
|
||||||
|
|
||||||
|
apply(vertices, edges,
|
||||||
|
vertices.context.defaultParallelism, edges.context.defaultParallelism)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def apply[VD: ClassManifest, ED: ClassManifest](
|
||||||
|
vertices: RDD[(Vid, VD)], edges: RDD[Edge[ED]],
|
||||||
|
numVPart: Int, numEPart: Int): GraphImpl[VD,ED] = {
|
||||||
|
|
||||||
|
val vtable = vertices.indexed(numVPart)
|
||||||
|
val etable = createETable(edges, numEPart)
|
||||||
|
val vid2pid = createVid2Pid(etable, vtable.index)
|
||||||
|
|
||||||
|
new GraphImpl(vtable, vid2pid, etable)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create the edge table RDD, which is much more efficient for Java heap storage than the
|
||||||
|
* normal edges data structure (RDD[(Vid, Vid, ED)]).
|
||||||
|
*
|
||||||
|
* The edge table contains multiple partitions, and each partition contains only one RDD
|
||||||
|
* key-value pair: the key is the partition id, and the value is an EdgePartition object
|
||||||
|
* containing all the edges in a partition.
|
||||||
|
*/
|
||||||
|
protected def createETable[ED: ClassManifest](
|
||||||
|
edges: RDD[Edge[ED]], numPartitions: Int)
|
||||||
|
: IndexedRDD[Pid, EdgePartition[ED]] = {
|
||||||
|
val ceilSqrt: Pid = math.ceil(math.sqrt(numPartitions)).toInt
|
||||||
|
edges
|
||||||
|
.map { e =>
|
||||||
|
// Random partitioning based on the source vertex id.
|
||||||
|
// val part: Pid = edgePartitionFunction1D(e.src, e.dst, numPartitions)
|
||||||
|
val part: Pid = edgePartitionFunction2D(e.src, e.dst, numPartitions, ceilSqrt)
|
||||||
|
//val part: Pid = canonicalEdgePartitionFunction2D(e.src, e.dst, numPartitions, ceilSqrt)
|
||||||
|
|
||||||
|
// Should we be using 3-tuple or an optimized class
|
||||||
|
MessageToPartition(part, (e.src, e.dst, e.data))
|
||||||
|
}
|
||||||
|
.partitionBy(new HashPartitioner(numPartitions))
|
||||||
|
.mapPartitionsWithIndex({ (pid, iter) =>
|
||||||
|
val builder = new EdgePartitionBuilder[ED]
|
||||||
|
iter.foreach { message =>
|
||||||
|
val data = message.data
|
||||||
|
builder.add(data._1, data._2, data._3)
|
||||||
|
}
|
||||||
|
Iterator((pid, builder.toEdgePartition))
|
||||||
|
}, preservesPartitioning = true).indexed()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected def createVid2Pid[ED: ClassManifest](
|
||||||
|
eTable: IndexedRDD[Pid, EdgePartition[ED]],
|
||||||
|
vTableIndex: RDDIndex[Vid]): IndexedRDD[Vid, Pid] = {
|
||||||
|
eTable.mapPartitions { iter =>
|
||||||
|
val (pid, edgePartition) = iter.next()
|
||||||
|
val vSet = new VertexSet
|
||||||
|
edgePartition.foreach(e => {vSet.add(e.src); vSet.add(e.dst)})
|
||||||
|
vSet.iterator.map { vid => (vid.toLong, pid) }
|
||||||
|
}.indexed(vTableIndex)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
protected def edgePartitionFunction1D(src: Vid, dst: Vid, numParts: Pid): Pid = {
|
protected def edgePartitionFunction1D(src: Vid, dst: Vid, numParts: Pid): Pid = {
|
||||||
val mixingPrime: Vid = 1125899906842597L
|
val mixingPrime: Vid = 1125899906842597L
|
||||||
|
@ -500,70 +640,44 @@ object GraphImpl {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create the edge table RDD, which is much more efficient for Java heap storage than the
|
|
||||||
* normal edges data structure (RDD[(Vid, Vid, ED)]).
|
|
||||||
*
|
|
||||||
* The edge table contains multiple partitions, and each partition contains only one RDD
|
|
||||||
* key-value pair: the key is the partition id, and the value is an EdgePartition object
|
|
||||||
* containing all the edges in a partition.
|
|
||||||
*/
|
|
||||||
protected def createETable[ED: ClassManifest](edges: RDD[Edge[ED]], numPartitions: Int)
|
|
||||||
: RDD[(Pid, EdgePartition[ED])] = {
|
|
||||||
val ceilSqrt: Pid = math.ceil(math.sqrt(numPartitions)).toInt
|
|
||||||
|
|
||||||
edges
|
|
||||||
.map { e =>
|
|
||||||
// Random partitioning based on the source vertex id.
|
|
||||||
// val part: Pid = edgePartitionFunction1D(e.src, e.dst, numPartitions)
|
|
||||||
val part: Pid = edgePartitionFunction2D(e.src, e.dst, numPartitions, ceilSqrt)
|
|
||||||
//val part: Pid = canonicalEdgePartitionFunction2D(e.src, e.dst, numPartitions, ceilSqrt)
|
|
||||||
|
|
||||||
// Should we be using 3-tuple or an optimized class
|
|
||||||
MessageToPartition(part, (e.src, e.dst, e.data))
|
|
||||||
// (math.abs(e.src) % numPartitions, (e.src, e.dst, e.data))
|
|
||||||
}
|
|
||||||
.partitionBy(new HashPartitioner(numPartitions))
|
|
||||||
.mapPartitionsWithIndex({ (pid, iter) =>
|
|
||||||
val edgePartition = new EdgePartition[ED]
|
|
||||||
iter.foreach { message =>
|
|
||||||
val data = message.data
|
|
||||||
edgePartition.add(data._1, data._2, data._3)
|
|
||||||
}
|
|
||||||
edgePartition.trim()
|
|
||||||
Iterator((pid, edgePartition))
|
|
||||||
}, preservesPartitioning = true)
|
|
||||||
}
|
|
||||||
|
|
||||||
protected def createVTable[VD: ClassManifest, ED: ClassManifest](
|
// protected def createVTable[VD: ClassManifest, ED: ClassManifest](
|
||||||
vertices: RDD[Vertex[VD]],
|
// eTable: IndexedRDD[Pid, EdgePartition[ED]],
|
||||||
eTable: RDD[(Pid, EdgePartition[ED])],
|
// vid2pid: Index
|
||||||
numPartitions: Int)
|
// vertices: RDD[Vertex[VD]],
|
||||||
: RDD[(Vid, (VD, Array[Pid]))] = {
|
|
||||||
val partitioner = new HashPartitioner(numPartitions)
|
|
||||||
|
|
||||||
// A key-value RDD. The key is a vertex id, and the value is a list of
|
// default: VD) : IndexedRDD[Vid, VD] = {
|
||||||
// partitions that contains edges referencing the vertex.
|
|
||||||
val vid2pid : RDD[(Vid, Seq[Pid])] = eTable.mapPartitions { iter =>
|
|
||||||
val (pid, edgePartition) = iter.next()
|
|
||||||
val vSet = new VertexSet
|
|
||||||
var i = 0
|
|
||||||
while (i < edgePartition.srcIds.size) {
|
|
||||||
vSet.add(edgePartition.srcIds.getLong(i))
|
|
||||||
vSet.add(edgePartition.dstIds.getLong(i))
|
|
||||||
i += 1
|
|
||||||
}
|
|
||||||
vSet.iterator.map { vid => (vid.toLong, pid) }
|
|
||||||
}.groupByKey(partitioner)
|
|
||||||
|
|
||||||
vertices
|
// // Compute all the vertices in the edge table.
|
||||||
.map { v => (v.id, v.data) }
|
// val vid2pid = createVid2Pid(eTable)
|
||||||
.partitionBy(partitioner)
|
|
||||||
.leftOuterJoin(vid2pid)
|
// // Compute all the
|
||||||
.mapValues {
|
// vertices.map(v => (v.id, v.data)).cogroup(vids)
|
||||||
case (vdata, None) => (vdata, Array.empty[Pid])
|
|
||||||
case (vdata, Some(pids)) => (vdata, pids.toArray)
|
// // A key-value RDD. The key is a vertex id, and the value is a list of
|
||||||
}
|
// // partitions that contains edges referencing the vertex.
|
||||||
}
|
// val vid2pid : RDD[(Vid, Seq[Pid])] = eTable.mapPartitions { iter =>
|
||||||
|
// val (pid, edgePartition) = iter.next()
|
||||||
|
// val vSet = new VertexSet
|
||||||
|
// var i = 0
|
||||||
|
// while (i < edgePartition.srcIds.size) {
|
||||||
|
// vSet.add(edgePartition.srcIds.getLong(i))
|
||||||
|
// vSet.add(edgePartition.dstIds.getLong(i))
|
||||||
|
// i += 1
|
||||||
|
// }
|
||||||
|
// vSet.iterator.map { vid => (vid.toLong, pid) }
|
||||||
|
// }.groupByKey(partitioner)
|
||||||
|
|
||||||
|
// vertices
|
||||||
|
// .map { v => (v.id, v.data) }
|
||||||
|
// .partitionBy(partitioner)
|
||||||
|
// .leftOuterJoin(vid2pid)
|
||||||
|
// .mapValues {
|
||||||
|
// case (vdata, None) => (vdata, Array.empty[Pid])
|
||||||
|
// case (vdata, Some(pids)) => (vdata, pids.toArray)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ object GraphGenerators {
|
||||||
val host = "local[4]"
|
val host = "local[4]"
|
||||||
val sc = new SparkContext(host, "Lognormal graph generator")
|
val sc = new SparkContext(host, "Lognormal graph generator")
|
||||||
|
|
||||||
val lnGraph = lognormalGraph(sc, 10000)
|
val lnGraph = logNormalGraph(sc, 10000)
|
||||||
|
|
||||||
val rmat = rmatGraph(sc, 1000, 3000)
|
val rmat = rmatGraph(sc, 1000, 3000)
|
||||||
|
|
||||||
|
@ -69,19 +69,21 @@ object GraphGenerators {
|
||||||
|
|
||||||
// Right now it just generates a bunch of edges where
|
// Right now it just generates a bunch of edges where
|
||||||
// the edge data is the weight (default 1)
|
// the edge data is the weight (default 1)
|
||||||
def lognormalGraph(sc: SparkContext, numVertices: Int): GraphImpl[Int, Int] = {
|
def logNormalGraph(sc: SparkContext, numVertices: Int): GraphImpl[Int, Int] = {
|
||||||
// based on Pregel settings
|
// based on Pregel settings
|
||||||
val mu = 4
|
val mu = 4
|
||||||
val sigma = 1.3
|
val sigma = 1.3
|
||||||
//val vertsAndEdges = (0 until numVertices).flatMap { src => {
|
//val vertsAndEdges = (0 until numVertices).flatMap { src => {
|
||||||
val vertices = (0 until numVertices).flatMap { src =>
|
|
||||||
Array(Vertex(src, sampleLogNormal(mu, sigma, numVertices))) }
|
|
||||||
val edges = vertices.flatMap( { v =>
|
|
||||||
generateRandomEdges(v.id.toInt, v.data, numVertices) })
|
|
||||||
|
|
||||||
|
val vertices: RDD[(Vid, Int)] = sc.parallelize(0 until numVertices).map{
|
||||||
|
src => (src, sampleLogNormal(mu, sigma, numVertices))
|
||||||
|
}
|
||||||
|
|
||||||
|
val edges = vertices.flatMap{
|
||||||
|
v => generateRandomEdges(v._1.toInt, v._2, numVertices)
|
||||||
|
}
|
||||||
|
|
||||||
new GraphImpl[Int, Int](sc.parallelize(vertices), sc.parallelize(edges))
|
GraphImpl(vertices, edges)
|
||||||
//println("Vertices:")
|
//println("Vertices:")
|
||||||
//for (v <- vertices) {
|
//for (v <- vertices) {
|
||||||
// println(v.id)
|
// println(v.id)
|
||||||
|
@ -161,8 +163,8 @@ object GraphGenerators {
|
||||||
|
|
||||||
val vertices = edges.flatMap { edge => List((edge.src, 1)) }
|
val vertices = edges.flatMap { edge => List((edge.src, 1)) }
|
||||||
.reduceByKey(_ + _)
|
.reduceByKey(_ + _)
|
||||||
.map{ case (vid, degree) => Vertex(vid, degree) }
|
.map{ case (vid, degree) => (vid, degree) }
|
||||||
new GraphImpl[Int, ED](vertices, edges)
|
GraphImpl(vertices, edges)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue