Remove duplicate method in GraphLoader and improve docs

2014-01-10 12:37:20 -08:00 · 2014-01-10 12:37:20 -08:00 · 9454fa1f6c
parent 37611e57f6
commit 9454fa1f6c
1 changed file with 13 additions and 50 deletions
--- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala
@@ -8,51 +8,20 @@ import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.graphx.impl.{EdgePartition, GraphImpl}
 import org.apache.spark.util.collection.PrimitiveVector

-
+/**
+ * Provides utilities for loading [[Graph]]s from files.
+ */
 object GraphLoader extends Logging {

  /**
-   * Load an edge list from file initializing the Graph
+   * Loads a graph from an edge list formatted file where each line contains two integers: a source
+   * id and a target id. Skips lines that begin with `#`.
   *
-   * @tparam ED the type of the edge data of the resulting Graph
+   * If desired, the edges can be automatically oriented in the positive
+   * direction (source Id < target Id) by setting `canonicalOrientation` to
+   * true.
   *
-   * @param sc the SparkContext used to construct RDDs
-   * @param path the path to the text file containing the edge list
-   * @param edgeParser a function that takes an array of strings and
-   * returns an ED object
-   * @param minEdgePartitions the number of partitions for the
-   * the Edge RDD
-   *
-   */
-  def textFile[ED: ClassTag](
-      sc: SparkContext,
-      path: String,
-      edgeParser: Array[String] => ED,
-      minEdgePartitions: Int = 1):
-    Graph[Int, ED] = {
-    // Parse the edge data table
-    val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter =>
-      iter.filter(line => !line.isEmpty && line(0) != '#').map { line =>
-        val lineArray = line.split("\\s+")
-        if(lineArray.length < 2) {
-          println("Invalid line: " + line)
-          assert(false)
-        }
-        val source = lineArray(0).trim.toLong
-        val target = lineArray(1).trim.toLong
-        val tail = lineArray.drop(2)
-        val edata = edgeParser(tail)
-        Edge(source, target, edata)
-      })
-    val defaultVertexAttr = 1
-    Graph.fromEdges(edges, defaultVertexAttr)
-  }
-
-  /**
-   * Load a graph from an edge list formatted file with each line containing
-   * two integers: a source Id and a target Id.
-   *
-   * @example A file in the following format:
+   * @example Loads a file in the following format:
   * {{{
   * # Comment Line
   * # Source Id <\t> Target Id
@@ -62,25 +31,19 @@ object GraphLoader extends Logging {
   * 1    8
   * }}}
   *
-   * If desired the edges can be automatically oriented in the positive
-   * direction (source Id < target Id) by setting `canonicalOrientation` to
-   * true
-   *
   * @param sc
-   * @param path the path to the file (e.g., /Home/data/file or hdfs://file)
+   * @param path the path to the file (e.g., /home/data/file or hdfs://file)
   * @param canonicalOrientation whether to orient edges in the positive
-   *        direction.
+   *        direction
   * @param minEdgePartitions the number of partitions for the
-   *        the Edge RDD
+   *        edge RDD
   * @tparam ED
-   * @return
   */
  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
-      minEdgePartitions: Int = 1):
-    Graph[Int, Int] = {
+      minEdgePartitions: Int = 1): Graph[Int, Int] = {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions