spark-instrumented-optimizer/src/scala/spark/HdfsFile.scala

package spark

import mesos.SlaveOffer

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputSplit
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapred.RecordReader
import org.apache.hadoop.mapred.Reporter

/**
 * A [[Split]] backed by a Hadoop `InputSplit`.
 *
 * The Hadoop split is held inside a `SerializableWritable` wrapper so that
 * this (serializable) object can carry it; the raw constructor argument is
 * only used to build that wrapper.
 *
 * @param s the Hadoop input split this object stands for
 */
@serializable
class HdfsSplit(@transient s: InputSplit) extends Split {
  /** The wrapped Hadoop split; read the underlying split via `.value`. */
  val inputSplit: SerializableWritable[InputSplit] =
    new SerializableWritable[InputSplit](s)

  // Delegates to the wrapper's string form rather than the default
  // Object.toString. NOTE(review): the change history says caching relied on
  // splits having a consistent toString -- confirm before altering this.
  override def toString: String = inputSplit.toString
}

/**
 * An RDD whose elements are the lines of the text input found at `path`,
 * read through Hadoop's `TextInputFormat`.
 *
 * The splits are computed once, at construction time, by asking the input
 * format for `sc.scheduler.numCores` splits. The job configuration, the input
 * format and the split array are all marked `@transient`; `iterator` builds
 * its own fresh `JobConf` and `TextInputFormat` instead of using them.
 *
 * @param sc   the context this RDD belongs to
 * @param path input path(s), passed verbatim to `FileInputFormat.setInputPaths`
 */
class HdfsTextFile(sc: SparkContext, path: String)
extends RDD[String](sc) {
  @transient val conf = new JobConf()
  @transient val inputFormat = new TextInputFormat()

  FileInputFormat.setInputPaths(conf, path)
  // Every configure() call in this file is serialized on ConfigureLock.
  ConfigureLock.synchronized { inputFormat.configure(conf) }

  @transient val splits_ =
    inputFormat.getSplits(conf, sc.scheduler.numCores).map(new HdfsSplit(_)).toArray

  /** The splits computed at construction time, viewed as generic `Split`s. */
  override def splits = splits_.asInstanceOf[Array[Split]]

  /**
   * Returns an iterator over the lines of one split.
   *
   * The record reader is opened eagerly when the iterator is created and is
   * closed as soon as the end of the split has been observed. An iterator
   * that is abandoned before exhaustion does not close its reader.
   *
   * @param split_in the split to read; must be an [[HdfsSplit]]
   */
  override def iterator(split_in: Split) = new Iterator[String] {
    val split = split_in.asInstanceOf[HdfsSplit]
    var reader: RecordReader[LongWritable, Text] = null
    ConfigureLock.synchronized {
      // A fresh configuration is built here; the enclosing RDD's `conf` is
      // @transient and is deliberately not used.
      val conf = new JobConf()
      conf.set("io.file.buffer.size",
          System.getProperty("spark.buffer.size", "65536"))
      val tif = new TextInputFormat()
      tif.configure(conf)
      reader = tif.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)
    }
    // Reusable holders that reader.next() fills in for each record.
    val lineNum = new LongWritable()
    val text = new Text()
    // gotNext: a record (or the end-of-input marker) has been fetched but not
    // yet handed out by next(). finished: the reader has no more records.
    var gotNext = false
    var finished = false

    // Releases the underlying reader. It runs at most once: after the end of
    // input is seen, gotNext stays true, so hasNext never fetches again.
    private def closeReader(): Unit = {
      try {
        reader.close()
      } catch {
        case e: java.io.IOException =>
          // Best effort: every record has already been delivered, so a
          // failure while closing must not fail the read.
          ()
      }
    }

    override def hasNext: Boolean = {
      if (!gotNext) {
        try {
          finished = !reader.next(lineNum, text)
        } catch {
          // Treat a premature EOF (e.g. a corrupted gzip file) as a normal
          // end of input instead of failing the whole read.
          case eofe: java.io.EOFException =>
            finished = true
        }
        gotNext = true
        if (finished)
          closeReader()
      }
      !finished
    }

    override def next: String = {
      // Fetch through hasNext so that next() gets the same EOF handling and
      // never calls reader.next() again once the input is exhausted.
      if (!hasNext)
        throw new java.util.NoSuchElementException("end of stream")
      gotNext = false
      text.toString
    }
  }

  /**
   * Host names reported by the Hadoop split as holding its data, with
   * "localhost" entries removed.
   */
  override def preferredLocations(split: Split) = {
    // TODO: Filtering out "localhost" in case of file:// URLs
    split.asInstanceOf[HdfsSplit].inputSplit.value.getLocations().filter(_ != "localhost")
  }
}

/**
 * Process-wide monitor object with no members of its own.
 *
 * Code in this file wraps Hadoop input-format `configure` calls and record
 * reader creation in `ConfigureLock.synchronized { ... }` so that they run
 * one at a time.
 * NOTE(review): presumably this works around configure() not being
 * thread-safe -- confirm.
 */
object ConfigureLock
Initial commit 2010-03-29 19:17:55 -04:00			`package spark`

Updated code to work with Nexus->Mesos name change 2010-07-25 23:53:46 -04:00			`import mesos.SlaveOffer`
Initial commit 2010-03-29 19:17:55 -04:00
			`import org.apache.hadoop.io.LongWritable`
			`import org.apache.hadoop.io.Text`
			`import org.apache.hadoop.mapred.FileInputFormat`
			`import org.apache.hadoop.mapred.InputSplit`
			`import org.apache.hadoop.mapred.JobConf`
			`import org.apache.hadoop.mapred.TextInputFormat`
			`import org.apache.hadoop.mapred.RecordReader`
			`import org.apache.hadoop.mapred.Reporter`

- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`@serializable class HdfsSplit(@transient s: InputSplit)`
			`extends Split {`
			`val inputSplit = new SerializableWritable[InputSplit](s)`
Fixed a rather bad bug in HDFS files that has been in for a while: caching was not working because Split objects did not have a consistent toString value 2010-10-03 01:06:06 -04:00			`override def toString = inputSplit.toString`
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`}`
Initial commit 2010-03-29 19:17:55 -04:00
			`class HdfsTextFile(sc: SparkContext, path: String)`
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`extends RDD[String](sc) {`
Initial commit 2010-03-29 19:17:55 -04:00			`@transient val conf = new JobConf()`
			`@transient val inputFormat = new TextInputFormat()`

			`FileInputFormat.setInputPaths(conf, path)`
			`ConfigureLock.synchronized { inputFormat.configure(conf) }`

			`@transient val splits_ =`
round robin scheduling of tasks has been added 2010-09-07 17:03:59 -04:00			`inputFormat.getSplits(conf, sc.scheduler.numCores).map(new HdfsSplit(_)).toArray`
Initial commit 2010-03-29 19:17:55 -04:00
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`override def splits = splits_.asInstanceOf[Array[Split]]`
Initial commit 2010-03-29 19:17:55 -04:00
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`override def iterator(split_in: Split) = new Iterator[String] {`
			`val split = split_in.asInstanceOf[HdfsSplit]`
Initial commit 2010-03-29 19:17:55 -04:00			`var reader: RecordReader[LongWritable, Text] = null`
			`ConfigureLock.synchronized {`
			`val conf = new JobConf()`
			`conf.set("io.file.buffer.size",`
			`System.getProperty("spark.buffer.size", "65536"))`
			`val tif = new TextInputFormat()`
			`tif.configure(conf)`
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`reader = tif.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)`
Initial commit 2010-03-29 19:17:55 -04:00			`}`
			`val lineNum = new LongWritable()`
			`val text = new Text()`
			`var gotNext = false`
			`var finished = false`

			`override def hasNext: Boolean = {`
			`if (!gotNext) {`
HdfsFile.scala: added a try/catch block to exit gracefully for corrupted gzip files MesosScheduler.scala: formatted the slaveOffer() output to include the serialized task size RDD.scala: added support for aggregating RDDs on a per-split basis (aggregateSplit()) as well as for sampling without replacement (sample()) 2010-08-18 18:25:57 -04:00			`try {`
			`finished = !reader.next(lineNum, text)`
			`} catch {`
			`case eofe: java.io.EOFException =>`
			`finished = true`
			`}`
Initial commit 2010-03-29 19:17:55 -04:00			`gotNext = true`
			`}`
			`!finished`
			`}`

			`override def next: String = {`
			`if (!gotNext)`
			`finished = !reader.next(lineNum, text)`
			`if (finished)`
			`throw new java.util.NoSuchElementException("end of stream")`
			`gotNext = false`
			`text.toString`
			`}`
			`}`

- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`override def preferredLocations(split: Split) = {`
Imported changes from old repository (mostly Mosharaf's work, plus some fault tolerance code). 2010-04-04 02:44:55 -04:00			`// TODO: Filtering out "localhost" in case of file:// URLs`
- Got rid of 'Split' type parameter in RDD - Added SampledRDD, SplitRDD and CartesianRDD - Made Split a class rather than a type parameter - Added numCores() to Scheduler to help set default level of parallelism 2010-08-31 15:08:09 -04:00			`split.asInstanceOf[HdfsSplit].inputSplit.value.getLocations().filter(_ != "localhost")`
Imported changes from old repository (mostly Mosharaf's work, plus some fault tolerance code). 2010-04-04 02:44:55 -04:00			`}`
Initial commit 2010-03-29 19:17:55 -04:00			`}`

			`object ConfigureLock {}`