Remove duplicate method in GraphLoader and improve docs

This commit is contained in:
Ankur Dave 2014-01-10 12:37:20 -08:00
parent 37611e57f6
commit 9454fa1f6c

View file

@ -8,51 +8,20 @@ import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartition, GraphImpl}
import org.apache.spark.util.collection.PrimitiveVector
/**
* Provides utilities for loading [[Graph]]s from files.
*/
object GraphLoader extends Logging {
/**
* Load an edge list from file initializing the Graph
* Loads a graph from an edge list formatted file where each line contains two integers: a source
* id and a target id. Skips lines that begin with `#`.
*
* @tparam ED the type of the edge data of the resulting Graph
* If desired the edges can be automatically oriented in the positive
* direction (source Id < target Id) by setting `canonicalOrientation` to
* true.
*
* @param sc the SparkContext used to construct RDDs
* @param path the path to the text file containing the edge list
* @param edgeParser a function that takes an array of strings and
* returns an ED object
* @param minEdgePartitions the number of partitions for the
* the Edge RDD
*
*/
def textFile[ED: ClassTag](
sc: SparkContext,
path: String,
edgeParser: Array[String] => ED,
minEdgePartitions: Int = 1):
Graph[Int, ED] = {
// Parse the edge data table
val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter =>
iter.filter(line => !line.isEmpty && line(0) != '#').map { line =>
val lineArray = line.split("\\s+")
if(lineArray.length < 2) {
println("Invalid line: " + line)
assert(false)
}
val source = lineArray(0).trim.toLong
val target = lineArray(1).trim.toLong
val tail = lineArray.drop(2)
val edata = edgeParser(tail)
Edge(source, target, edata)
})
val defaultVertexAttr = 1
Graph.fromEdges(edges, defaultVertexAttr)
}
/**
* Load a graph from an edge list formatted file with each line containing
* two integers: a source Id and a target Id.
*
* @example A file in the following format:
* @example Loads a file in the following format:
* {{{
* # Comment Line
* # Source Id <\t> Target Id
@ -62,25 +31,19 @@ object GraphLoader extends Logging {
* 1 8
* }}}
*
* If desired the edges can be automatically oriented in the positive
* direction (source Id < target Id) by setting `canonicalOrientation` to
* true
*
* @param sc
* @param path the path to the file (e.g., /Home/data/file or hdfs://file)
* @param path the path to the file (e.g., /home/data/file or hdfs://file)
* @param canonicalOrientation whether to orient edges in the positive
* direction.
* direction
* @param minEdgePartitions the number of partitions for the
* the Edge RDD
* the edge RDD
* @tparam ED
* @return
*/
def edgeListFile(
sc: SparkContext,
path: String,
canonicalOrientation: Boolean = false,
minEdgePartitions: Int = 1):
Graph[Int, Int] = {
minEdgePartitions: Int = 1): Graph[Int, Int] = {
val startTime = System.currentTimeMillis
// Parse the edge data table directly into edge partitions