Merge HadoopDatasetRDD into HadoopRDD.

Harvey Feng 2013-09-28 18:32:41 -07:00
parent 417085716a
commit 7d06bdde1d
2 changed files with 40 additions and 27 deletions

core/src/main/scala/org/apache/spark/SparkContext.scala

@@ -332,17 +332,15 @@ class SparkContext(
    * etc).
    */
   def hadoopRDD[K, V](
-      conf: JobConf,
+      jobConf: JobConf,
       inputFormatClass: Class[_ <: InputFormat[K, V]],
       keyClass: Class[K],
       valueClass: Class[V],
       minSplits: Int = defaultMinSplits
       ): RDD[(K, V)] = {
     // Add necessary security credentials to the JobConf before broadcasting it.
-    SparkEnv.get.hadoop.addCredentials(conf)
-    // A Hadoop JobConf can be about 10 KB, which is pretty big, so broadcast it.
-    val confBroadcast = broadcast(new SerializableWritable(conf))
-    new HadoopDatasetRDD(this, confBroadcast, inputFormatClass, keyClass, valueClass, minSplits)
+    SparkEnv.get.hadoop.addCredentials(jobConf)
+    new HadoopRDD(this, jobConf, inputFormatClass, keyClass, valueClass, minSplits)
   }
 
   /** Get an RDD for a Hadoop file with an arbitrary InputFormat */
@@ -353,6 +351,7 @@ class SparkContext(
       valueClass: Class[V],
       minSplits: Int = defaultMinSplits
       ): RDD[(K, V)] = {
+    // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
     val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration))
     hadoopFile(path, confBroadcast, inputFormatClass, keyClass, valueClass, minSplits)
   }
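For orientation, a hedged caller-side sketch of the consolidated API after this change: the caller still hands a JobConf to SparkContext.hadoopRDD, which adds credentials and passes the conf straight to HadoopRDD; the broadcast now happens inside HadoopRDD (see the second file below). The master URL, app name, input path, and choice of TextInputFormat are illustrative, not taken from the commit.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.SparkContext

// Sketch only: reads a plain-text dataset through the old-style (mapred) Hadoop API.
object HadoopRDDSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "HadoopRDDSketch")  // illustrative master/app name
    val jobConf = new JobConf()                            // caller-configured JobConf
    FileInputFormat.setInputPaths(jobConf, "/tmp/input")   // hypothetical input path

    // After this commit, hadoopRDD adds credentials to jobConf and constructs a
    // HadoopRDD directly; HadoopRDD broadcasts the conf itself.
    val records = sc.hadoopRDD(jobConf, classOf[TextInputFormat],
      classOf[LongWritable], classOf[Text])
    println(records.count())

    sc.stop()
  }
}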

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

@@ -43,20 +43,18 @@ import org.apache.hadoop.conf.{Configuration, Configurable}
 class HadoopFileRDD[K, V](
     sc: SparkContext,
     path: String,
-    hadoopConfBroadcast: Broadcast[SerializableWritable[Configuration]],
+    confBroadcast: Broadcast[SerializableWritable[Configuration]],
     inputFormatClass: Class[_ <: InputFormat[K, V]],
     keyClass: Class[K],
     valueClass: Class[V],
     minSplits: Int)
-  extends HadoopRDD[K, V](sc, inputFormatClass, keyClass, valueClass, minSplits) {
-
-  private val jobConfCacheKey = "rdd_%d_job_conf".format(id)
+  extends HadoopRDD[K, V](sc, confBroadcast, inputFormatClass, keyClass, valueClass, minSplits) {
 
   override def getJobConf(): JobConf = {
     if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) {
       return HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf]
     } else {
-      val newJobConf = new JobConf(hadoopConfBroadcast.value.value)
+      val newJobConf = new JobConf(confBroadcast.value.value)
       FileInputFormat.setInputPaths(newJobConf, path)
       HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
       return newJobConf
@@ -64,21 +62,6 @@ class HadoopFileRDD[K, V](
   }
 }
 
-/**
- * An RDD that reads a Hadoop dataset as specified by a JobConf (e.g. tables in HBase).
- */
-class HadoopDatasetRDD[K, V](
-    sc: SparkContext,
-    confBroadcast: Broadcast[SerializableWritable[JobConf]],
-    inputFormatClass: Class[_ <: InputFormat[K, V]],
-    keyClass: Class[K],
-    valueClass: Class[V],
-    minSplits: Int)
-  extends HadoopRDD[K, V](sc, inputFormatClass, keyClass, valueClass, minSplits) {
-
-  override def getJobConf(): JobConf = confBroadcast.value.value
-}
-
 /**
  * A Spark split class that wraps around a Hadoop InputSplit.
  */
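Migration note (a sketch, not part of the commit): code that previously instantiated the removed HadoopDatasetRDD with a pre-broadcast JobConf can construct HadoopRDD directly; the JobConf-taking auxiliary constructor added in the next hunk performs the broadcast itself. The snippet assumes an existing SparkContext named sc (e.g. a spark-shell session), an illustrative input path and minSplits value, and the org.apache.spark.rdd package layout of this era.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.rdd.HadoopRDD

val jobConf = new JobConf()
FileInputFormat.setInputPaths(jobConf, "/tmp/dataset")  // hypothetical path

// Before: new HadoopDatasetRDD(sc, sc.broadcast(new SerializableWritable(jobConf)), ...)
// After:  the auxiliary constructor broadcasts jobConf internally.
val rdd = new HadoopRDD(sc, jobConf, classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text], 2)
println(rdd.count())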
@@ -95,18 +78,49 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSplit)
 /**
  * A base class that provides core functionality for reading data partitions stored in Hadoop.
  */
-abstract class HadoopRDD[K, V](
+class HadoopRDD[K, V](
     sc: SparkContext,
+    confBroadcast: Broadcast[SerializableWritable[Configuration]],
     inputFormatClass: Class[_ <: InputFormat[K, V]],
     keyClass: Class[K],
     valueClass: Class[V],
     minSplits: Int)
   extends RDD[(K, V)](sc, Nil) with Logging {
 
+  def this(
+      sc: SparkContext,
+      jobConf: JobConf,
+      inputFormatClass: Class[_ <: InputFormat[K, V]],
+      keyClass: Class[K],
+      valueClass: Class[V],
+      minSplits: Int) = {
+    this(
+      sc,
+      sc.broadcast(new SerializableWritable(jobConf))
+        .asInstanceOf[Broadcast[SerializableWritable[Configuration]]],
+      inputFormatClass,
+      keyClass,
+      valueClass,
+      minSplits)
+  }
+
+  protected val jobConfCacheKey = "rdd_%d_job_conf".format(id)
+
   private val inputFormatCacheKey = "rdd_%d_input_format".format(id)
 
   // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads.
-  protected def getJobConf(): JobConf
+  protected def getJobConf(): JobConf = {
+    val conf: Configuration = confBroadcast.value.value
+    if (conf.isInstanceOf[JobConf]) {
+      return conf.asInstanceOf[JobConf]
+    } else if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) {
+      return HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf]
+    } else {
+      val newJobConf = new JobConf(confBroadcast.value.value)
+      HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
+      return newJobConf
+    }
+  }
 
   def getInputFormat(conf: JobConf): InputFormat[K, V] = {
     if (HadoopRDD.containsCachedMetadata(inputFormatCacheKey)) {