package spark

import mesos.SlaveOffer

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileInputFormat
import org.apache.hadoop.mapred.InputFormat
import org.apache.hadoop.mapred.InputSplit
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.mapred.RecordReader
import org.apache.hadoop.mapred.Reporter
import org.apache.hadoop.util.ReflectionUtils

/** A Spark split class that wraps around a Hadoop InputSplit */
@serializable class HadoopSplit(@transient s: InputSplit)
extends Split {
  val inputSplit = new SerializableWritable[InputSplit](s)

  // Hadoop gives each split a unique toString value, so use this as our ID
  override def getId() = "HadoopSplit(" + inputSplit.toString + ")"
}

/**
 * An RDD that reads a Hadoop file (from HDFS, S3, the local filesystem, etc)
 * and represents it as a set of key-value pairs using a given InputFormat.
 */
class HadoopFile[K, V](
  sc: SparkContext,
  path: String,
  inputFormatClass: Class[_ <: InputFormat[K, V]],
  keyClass: Class[K],
  valueClass: Class[V])
extends RDD[(K, V)](sc) {
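  // Compute the input splits eagerly when the RDD is created, holding ConfigureLock
  // because configuring Hadoop JobConfs/InputFormats is not thread safe (see below).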
  @transient val splits_ : Array[Split] = ConfigureLock.synchronized {
    val conf = new JobConf()
    FileInputFormat.setInputPaths(conf, path)
    val inputFormat = createInputFormat(conf)
    val inputSplits = inputFormat.getSplits(conf, sc.numCores)
    inputSplits.map(x => new HadoopSplit(x): Split).toArray
  }

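  // Instantiate the user-supplied InputFormat class through Hadoop's ReflectionUtils,
  // which also lets the new instance configure itself from the given JobConf.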
  def createInputFormat(conf: JobConf): InputFormat[K, V] = {
    ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf)
      .asInstanceOf[InputFormat[K, V]]
  }

  override def splits = splits_

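  // Read one split by opening a Hadoop RecordReader over it and exposing the records
  // as a Scala Iterator of (key, value) pairs.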
  override def iterator(theSplit: Split) = new Iterator[(K, V)] {
    val split = theSplit.asInstanceOf[HadoopSplit]
    var reader: RecordReader[K, V] = null

    ConfigureLock.synchronized {
      val conf = new JobConf()
      val bufferSize = System.getProperty("spark.buffer.size", "65536")
      conf.set("io.file.buffer.size", bufferSize)
      val fmt = createInputFormat(conf)
      reader = fmt.getRecordReader(split.inputSplit.value, conf, Reporter.NULL)
    }

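    // Hadoop's RecordReader reuses one mutable key object and one mutable value object
    // for every record, so allocate them once and let reader.next() fill them in.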
    val key: K = keyClass.newInstance()
    val value: V = valueClass.newInstance()
    var gotNext = false
    var finished = false

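    // hasNext reads one record ahead: gotNext marks that a lookahead has already been
    // done, and finished becomes true once the reader runs out of records.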
    override def hasNext: Boolean = {
      if (!gotNext) {
        try {
          finished = !reader.next(key, value)
        } catch {
          case eofe: java.io.EOFException =>
            finished = true
        }
        gotNext = true
      }
      if (finished) {
        reader.close()
      }
      !finished
    }

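    // next returns the record fetched by the last lookahead (fetching one itself if
    // hasNext was never called), then clears gotNext so the following call advances.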
    override def next: (K, V) = {
      if (!gotNext) {
        finished = !reader.next(key, value)
      }
      if (finished) {
        throw new java.util.NoSuchElementException("End of stream")
      }
      gotNext = false
      (key, value)
    }
  }

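  // Preferred locations for a split are the hosts that the underlying Hadoop InputSplit
  // reports as holding its data, which the scheduler uses for data locality.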
  override def preferredLocations(split: Split) = {
    // TODO: Filtering out "localhost" in case of file:// URLs
    val hadoopSplit = split.asInstanceOf[HadoopSplit]
    hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost")
  }
}

/**
 * Convenience class for Hadoop files read using TextInputFormat that
 * represents the file as an RDD of Strings.
 */
class HadoopTextFile(sc: SparkContext, path: String)
extends MappedRDD[String, (LongWritable, Text)](
  new HadoopFile(sc, path, classOf[TextInputFormat],
                 classOf[LongWritable], classOf[Text]),
  { pair: (LongWritable, Text) => pair._2.toString }
)
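
// Hypothetical usage sketch (assumes a SparkContext named sc is already available and
// that the HDFS path below exists; both are illustrative, not part of this file):
//   val lines = new HadoopTextFile(sc, "hdfs://namenode:9000/data/pagecounts")
//   lines.foreach(line => println(line))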

/**
 * Object used to ensure that only one thread at a time is configuring Hadoop
 * InputFormat classes. Apparently configuring them is not thread safe!
 */
object ConfigureLock {}