package spark

import mesos.SlaveOffer

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputSplit
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapred.RecordReader
import org.apache.hadoop.mapred.Reporter
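
// A Split that wraps a Hadoop InputSplit. The raw InputSplit is not
// serializable, so the constructor argument is marked @transient and the
// split is held through the SerializableWritable wrapper instead.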
@serializable class HadoopSplit(@transient s: InputSplit)
extends Split {
  val inputSplit = new SerializableWritable[InputSplit](s)

  // Hadoop gives each split a unique toString value, so use this as our ID
  override def getId() = "HadoopSplit(" + inputSplit.toString + ")"
}
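
// An RDD of the lines of a text file stored in Hadoop, read through the old
// mapred API's TextInputFormat, with one Split per Hadoop InputSplit.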
class HadoopTextFile(sc: SparkContext, path: String)
extends RDD[String](sc) {
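  // The JobConf and input format are created on the master and marked
  // @transient, since neither is serializable.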
  @transient val conf = new JobConf()
  @transient val inputFormat = new TextInputFormat()

  FileInputFormat.setInputPaths(conf, path)
  ConfigureLock.synchronized { inputFormat.configure(conf) }
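
  // Compute the input splits eagerly, using the number of cores known to
  // the scheduler as the desired split count.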
  @transient val splits_ =
    inputFormat.getSplits(conf, sc.scheduler.numCores).map(new HadoopSplit(_)).toArray

  override def splits = splits_.asInstanceOf[Array[Split]]
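
  // iterator() is invoked where the task runs (typically on a worker); it
  // opens a RecordReader for this split and yields the file's lines.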
  override def iterator(split_in: Split) = new Iterator[String] {
    val split = split_in.asInstanceOf[HadoopSplit]
    var reader: RecordReader[LongWritable, Text] = null
    ConfigureLock.synchronized {
      val conf = new JobConf()
      conf.set("io.file.buffer.size",
        System.getProperty("spark.buffer.size", "65536"))
      val tif = new TextInputFormat()
      tif.configure(conf)
      reader = tif.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)
    }
    val lineNum = new LongWritable()
    val text = new Text()
    var gotNext = false
    var finished = false
    override def hasNext: Boolean = {
      if (!gotNext) {
        try {
          finished = !reader.next(lineNum, text)
        } catch {
          case eofe: java.io.EOFException =>
            finished = true
        }
        gotNext = true
      }
      !finished
    }
    override def next: String = {
      if (!gotNext) {
        // Mirror hasNext: treat an EOFException as end of stream here too
        try {
          finished = !reader.next(lineNum, text)
        } catch {
          case eofe: java.io.EOFException =>
            finished = true
        }
      }
      if (finished)
        throw new java.util.NoSuchElementException("end of stream")
      gotNext = false
      text.toString
    }
  }

  override def preferredLocations(split: Split) = {
    // TODO: Filtering out "localhost" in case of file:// URLs
    val hadoopSplit = split.asInstanceOf[HadoopSplit]
    hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost")
  }
}
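// Used purely as a lock object: the Hadoop configuration calls above
// synchronize on it, presumably because JobConf and InputFormat
// configuration are not thread-safe.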
object ConfigureLock {}
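
// A minimal usage sketch (hypothetical driver code, not part of this file):
// assuming a SparkContext named sc already exists, a HadoopTextFile can be
// constructed and one of its splits read locally:
//
//   val file = new HadoopTextFile(sc, "hdfs://namenode:9000/data/input.txt")
//   val split = file.splits(0)
//   for (line <- file.iterator(split))
//     println(line)
//
// The path and namenode address here are placeholders.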