package spark

import mesos.SlaveOffer

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputFormat
import org.apache.hadoop.mapred.InputSplit
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapred.RecordReader
import org.apache.hadoop.mapred.Reporter
import org.apache.hadoop.util.ReflectionUtils

/** A Spark split class that wraps around a Hadoop InputSplit */
@serializable class HadoopSplit(@transient s: InputSplit)
extends Split {
  val inputSplit = new SerializableWritable[InputSplit](s)

  // Hadoop gives each split a unique toString value, so use this as our ID
  override def getId() = "HadoopSplit(" + inputSplit.toString + ")"
}

/**
 * An RDD that reads a Hadoop file (from HDFS, S3, the local filesystem, etc)
 * and represents it as a set of key-value pairs using a given InputFormat.
 */
class HadoopFile[K, V](
  sc: SparkContext,
  path: String,
  inputFormatClass: Class[_ <: InputFormat[K, V]],
  keyClass: Class[K],
  valueClass: Class[V])
extends RDD[(K, V)](sc) {
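  // Compute the input splits eagerly when the RDD is created, holding ConfigureLock
  // because configuring Hadoop JobConfs/InputFormats is not thread safe (see below).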
  @transient val splits_ : Array[Split] = ConfigureLock.synchronized {
    val conf = new JobConf()
    FileInputFormat.setInputPaths(conf, path)
    val inputFormat = createInputFormat(conf)
    val inputSplits = inputFormat.getSplits(conf, sc.numCores)
    inputSplits.map(x => new HadoopSplit(x): Split).toArray
  }

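  // Instantiate the user-supplied InputFormat class through Hadoop's ReflectionUtils,
  // which also lets the new instance configure itself from the given JobConf.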
  def createInputFormat(conf: JobConf): InputFormat[K, V] = {
    ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf)
      .asInstanceOf[InputFormat[K, V]]
  }

  override def splits = splits_

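  // Read one split by opening a Hadoop RecordReader over it and exposing the records
  // as a Scala Iterator of (key, value) pairs.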
  override def iterator(theSplit: Split) = new Iterator[(K, V)] {
    val split = theSplit.asInstanceOf[HadoopSplit]
    var reader: RecordReader[K, V] = null

    ConfigureLock.synchronized {
      val conf = new JobConf()
      val bufferSize = System.getProperty("spark.buffer.size", "65536")
      conf.set("io.file.buffer.size", bufferSize)
      val fmt = createInputFormat(conf)
      reader = fmt.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)
    }

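    // Hadoop's RecordReader reuses one mutable key object and one mutable value object
    // for every record, so allocate them once and let reader.next() fill them in.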
    val key: K = keyClass.newInstance()
    val value: V = valueClass.newInstance()
    var gotNext = false
    var finished = false

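    // hasNext reads one record ahead: gotNext marks that a lookahead has already been
    // done, and finished becomes true once the reader runs out of records.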
    override def hasNext: Boolean = {
      if (!gotNext) {
        try {
          finished = !reader.next(key, value)
        } catch {
          case eofe: java.io.EOFException =>
            finished = true
        }
        gotNext = true
      }
      if (finished) {
        reader.close()
      }
      !finished
    }

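    // next returns the record fetched by the last lookahead (fetching one itself if
    // hasNext was never called), then clears gotNext so the following call advances.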
    override def next: (K, V) = {
      if (!gotNext) {
        finished = !reader.next(key, value)
      }
      if (finished) {
        throw new java.util.NoSuchElementException("End of stream")
      }
      gotNext = false
      (key, value)
    }
  }

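  // Preferred locations for a split are the hosts that the underlying Hadoop InputSplit
  // reports as holding its data, which the scheduler uses for data locality.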
  override def preferredLocations(split: Split) = {
    // TODO: Filtering out "localhost" in case of file:// URLs
    val hadoopSplit = split.asInstanceOf[HadoopSplit]
    hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost")
  }
}

/**
 * Convenience class for Hadoop files read using TextInputFormat that
 * represents the file as an RDD of Strings.
 */
class HadoopTextFile(sc: SparkContext, path: String)
extends MappedRDD[String, (LongWritable, Text)](
  new HadoopFile(sc, path, classOf[TextInputFormat],
                 classOf[LongWritable], classOf[Text]),
  { pair: (LongWritable, Text) => pair._2.toString }
)
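
// Hypothetical usage sketch (assumes a SparkContext named sc is already available and
// that the HDFS path below exists; both are illustrative, not part of this file):
//   val lines = new HadoopTextFile(sc, "hdfs://namenode:9000/data/pagecounts")
//   lines.foreach(line => println(line))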

/**
 * Object used to ensure that only one thread at a time is configuring Hadoop
 * InputFormat classes. Apparently configuring them is not thread safe!
 */
object ConfigureLock {}