// spark/HadoopFile.scala
package spark
import mesos.SlaveOffer
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputSplit
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapred.RecordReader
import org.apache.hadoop.mapred.Reporter
/**
 * A Split backed by a Hadoop InputSplit. Raw InputSplits are not
 * java.io.Serializable, so the split is wrapped in a SerializableWritable
 * before being shipped to workers.
 */
@serializable class HadoopSplit(@transient s: InputSplit)
extends Split {
  val inputSplit = new SerializableWritable[InputSplit](s)

  // Hadoop gives each split a unique toString value, so use this as our ID.
  // BUGFIX: stringify the wrapped InputSplit itself (inputSplit.value), not
  // the wrapper — the wrapper's default Object.toString is not stable across
  // serialization round trips, so the ID would differ between driver and worker.
  override def getId() = "HadoopSplit(" + inputSplit.value.toString + ")"
}
/**
 * An RDD of the lines of a Hadoop text file at `path`, with one Split per
 * Hadoop InputSplit.
 *
 * @param sc   the SparkContext this RDD belongs to
 * @param path input path (or comma-separated paths) understood by
 *             FileInputFormat.setInputPaths
 */
class HadoopTextFile(sc: SparkContext, path: String)
extends RDD[String](sc) {
  // JobConf/InputFormat must not leak to workers; splits are shipped instead.
  @transient val conf = new JobConf()
  @transient val inputFormat = new TextInputFormat()
  FileInputFormat.setInputPaths(conf, path)
  // NOTE(review): configure() appears not to be thread-safe, hence the
  // process-wide lock — confirm against the Hadoop version in use.
  ConfigureLock.synchronized { inputFormat.configure(conf) }
  @transient val splits_ =
    inputFormat.getSplits(conf, sc.scheduler.numCores).map(new HadoopSplit(_)).toArray

  override def splits = splits_.asInstanceOf[Array[Split]]

  /** Iterate over the lines of one split, reading records lazily. */
  override def iterator(split_in: Split) = new Iterator[String] {
    val split = split_in.asInstanceOf[HadoopSplit]
    var reader: RecordReader[LongWritable, Text] = null
    ConfigureLock.synchronized {
      val conf = new JobConf()
      conf.set("io.file.buffer.size",
               System.getProperty("spark.buffer.size", "65536"))
      val tif = new TextInputFormat()
      tif.configure(conf)
      reader = tif.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)
    }
    val lineNum = new LongWritable()
    val text = new Text()
    var gotNext = false   // true when (lineNum, text) holds an unreturned record
    var finished = false  // true once the underlying reader is exhausted

    // Read one record into (lineNum, text), treating EOFException as a normal
    // end of stream (some streams signal exhaustion this way).
    private def fetchNext(): Unit = {
      try {
        finished = !reader.next(lineNum, text)
      } catch {
        case eofe: java.io.EOFException =>
          finished = true
      }
      gotNext = true
    }

    override def hasNext: Boolean = {
      if (!gotNext)
        fetchNext()
      !finished
    }

    override def next: String = {
      // BUGFIX: previously next() called reader.next directly without the
      // EOFException handling hasNext had, so calling next() at end of stream
      // could throw EOFException instead of NoSuchElementException.
      if (!gotNext)
        fetchNext()
      if (finished)
        throw new java.util.NoSuchElementException("end of stream")
      gotNext = false
      text.toString
    }
  }

  /** Hosts holding this split's data, per Hadoop's placement information. */
  override def preferredLocations(split: Split) = {
    // TODO: Filtering out "localhost" in case of file:// URLs
    val hadoopSplit = split.asInstanceOf[HadoopSplit]
    hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost")
  }
}
// Process-wide lock object; this file synchronizes on it around Hadoop
// JobConf/InputFormat configuration and record-reader creation — presumably
// because those Hadoop code paths are not thread-safe (verify for the Hadoop
// version in use).
object ConfigureLock {}