Merge pull request #645 from pwendell/compression

Adding compression to Hadoop save functions

commit d93851aedf
@@ -10,6 +10,8 @@ import scala.collection.JavaConversions._
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.compress.CompressionCodec
+import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.mapred.FileOutputCommitter
 import org.apache.hadoop.mapred.FileOutputFormat
 import org.apache.hadoop.mapred.HadoopWriter
@@ -515,6 +517,16 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
     saveAsHadoopFile(path, getKeyClass, getValueClass, fm.erasure.asInstanceOf[Class[F]])
   }
 
+  /**
+   * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class
+   * supporting the key and value types K and V in this RDD. Compress the result with the
+   * supplied codec.
+   */
+  def saveAsHadoopFile[F <: OutputFormat[K, V]](
+      path: String, codec: Class[_ <: CompressionCodec]) (implicit fm: ClassManifest[F]) {
+    saveAsHadoopFile(path, getKeyClass, getValueClass, fm.erasure.asInstanceOf[Class[F]], codec)
+  }
+
   /**
    * Output the RDD to any Hadoop-supported file system, using a new Hadoop API `OutputFormat`
    * (mapreduce.OutputFormat) object supporting the key and value types K and V in this RDD.
@@ -574,6 +586,20 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
     jobCommitter.cleanupJob(jobTaskContext)
   }
 
+  /**
+   * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class
+   * supporting the key and value types K and V in this RDD. Compress with the supplied codec.
+   */
+  def saveAsHadoopFile(
+      path: String,
+      keyClass: Class[_],
+      valueClass: Class[_],
+      outputFormatClass: Class[_ <: OutputFormat[_, _]],
+      codec: Class[_ <: CompressionCodec]) {
+    saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass,
+      new JobConf(self.context.hadoopConfiguration), Some(codec))
+  }
+
   /**
    * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class
    * supporting the key and value types K and V in this RDD.
@@ -583,11 +609,19 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
       keyClass: Class[_],
       valueClass: Class[_],
       outputFormatClass: Class[_ <: OutputFormat[_, _]],
-      conf: JobConf = new JobConf(self.context.hadoopConfiguration)) {
+      conf: JobConf = new JobConf(self.context.hadoopConfiguration),
+      codec: Option[Class[_ <: CompressionCodec]] = None) {
     conf.setOutputKeyClass(keyClass)
     conf.setOutputValueClass(valueClass)
     // conf.setOutputFormat(outputFormatClass) // Doesn't work in Scala 2.9 due to what may be a generics bug
     conf.set("mapred.output.format.class", outputFormatClass.getName)
+    for (c <- codec) {
+      conf.setCompressMapOutput(true)
+      conf.set("mapred.output.compress", "true")
+      conf.setMapOutputCompressorClass(c)
+      conf.set("mapred.output.compression.codec", c.getCanonicalName)
+      conf.set("mapred.output.compression.type", CompressionType.BLOCK.toString)
+    }
     conf.setOutputCommitter(classOf[FileOutputCommitter])
     FileOutputFormat.setOutputPath(conf, HadoopWriter.createPathFromString(path, conf))
     saveAsHadoopDataset(conf)
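The PairRDDFunctions changes above layer onto one another: the public overloads accept a codec class and forward it as Some(codec) to the base saveAsHadoopFile, which then configures the JobConf for BLOCK-level output compression. A minimal usage sketch, not part of the commit, assuming a live SparkContext named sc and an illustrative output path:

    import org.apache.hadoop.io.{IntWritable, Text}
    import org.apache.hadoop.io.compress.GzipCodec
    import org.apache.hadoop.mapred.SequenceFileOutputFormat
    import spark.SparkContext._  // implicit conversion to PairRDDFunctions

    // Convert to Writable key/value pairs, then save with the new codec overload.
    val pairs = sc.parallelize(Seq((1, "a"), (2, "aa"), (3, "aaa")))
      .map { case (k, v) => (new IntWritable(k), new Text(v)) }
    pairs.saveAsHadoopFile("/tmp/pairs_gz", classOf[IntWritable], classOf[Text],
      classOf[SequenceFileOutputFormat[IntWritable, Text]], classOf[GzipCodec])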
@@ -7,6 +7,7 @@ import scala.collection.JavaConversions.mapAsScalaMap
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.io.BytesWritable
+import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.io.NullWritable
 import org.apache.hadoop.io.Text
 import org.apache.hadoop.mapred.TextOutputFormat
@@ -730,6 +731,14 @@ abstract class RDD[T: ClassManifest](
       .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
   }
 
+  /**
+   * Save this RDD as a compressed text file, using string representations of elements.
+   */
+  def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]) {
+    this.map(x => (NullWritable.get(), new Text(x.toString)))
+      .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path, codec)
+  }
+
   /**
    * Save this RDD as a SequenceFile of serialized objects.
   */
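The new RDD.saveAsTextFile overload just pairs each element with NullWritable and delegates to the compressed saveAsHadoopFile above. A sketch of a call site (sc and the target path are assumed for illustration):

    import org.apache.hadoop.io.compress.GzipCodec

    // Each partition becomes one compressed part file, e.g. part-00000.gz.
    sc.parallelize(1 to 1000).saveAsTextFile("/tmp/numbers_gz", classOf[GzipCodec])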
@@ -18,6 +18,7 @@ import org.apache.hadoop.mapred.TextOutputFormat
 import org.apache.hadoop.mapred.SequenceFileOutputFormat
 import org.apache.hadoop.mapred.OutputCommitter
 import org.apache.hadoop.mapred.FileOutputCommitter
+import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.io.NullWritable
 import org.apache.hadoop.io.BytesWritable
@@ -62,7 +63,7 @@ class SequenceFileRDDFunctions[K <% Writable: ClassManifest, V <% Writable : ClassManifest](
    * byte arrays to BytesWritable, and Strings to Text. The `path` can be on any Hadoop-supported
    * file system.
    */
-  def saveAsSequenceFile(path: String) {
+  def saveAsSequenceFile(path: String, codec: Option[Class[_ <: CompressionCodec]] = None) {
     def anyToWritable[U <% Writable](u: U): Writable = u
 
     val keyClass = getWritableClass[K]
@@ -72,14 +73,18 @@ class SequenceFileRDDFunctions[K <% Writable: ClassManifest, V <% Writable : ClassManifest](
 
     logInfo("Saving as sequence file of type (" + keyClass.getSimpleName + "," + valueClass.getSimpleName + ")" )
     val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
+    val jobConf = new JobConf(self.context.hadoopConfiguration)
     if (!convertKey && !convertValue) {
-      self.saveAsHadoopFile(path, keyClass, valueClass, format)
+      self.saveAsHadoopFile(path, keyClass, valueClass, format, jobConf, codec)
     } else if (!convertKey && convertValue) {
-      self.map(x => (x._1,anyToWritable(x._2))).saveAsHadoopFile(path, keyClass, valueClass, format)
+      self.map(x => (x._1,anyToWritable(x._2))).saveAsHadoopFile(
+        path, keyClass, valueClass, format, jobConf, codec)
     } else if (convertKey && !convertValue) {
-      self.map(x => (anyToWritable(x._1),x._2)).saveAsHadoopFile(path, keyClass, valueClass, format)
+      self.map(x => (anyToWritable(x._1),x._2)).saveAsHadoopFile(
+        path, keyClass, valueClass, format, jobConf, codec)
     } else if (convertKey && convertValue) {
-      self.map(x => (anyToWritable(x._1),anyToWritable(x._2))).saveAsHadoopFile(path, keyClass, valueClass, format)
+      self.map(x => (anyToWritable(x._1),anyToWritable(x._2))).saveAsHadoopFile(
+        path, keyClass, valueClass, format, jobConf, codec)
     }
   }
 }
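All four conversion branches now share one JobConf and thread the optional codec through, so compression behaves the same whether or not keys and values need converting to Writables. A sketch of the resulting API, with DefaultCodec standing in for any CompressionCodec and hypothetical paths:

    import org.apache.hadoop.io.compress.DefaultCodec
    import spark.SparkContext._  // implicit conversions to SequenceFileRDDFunctions

    val data = sc.parallelize(Seq(("k1", "v1"), ("k2", "v2")))
    data.saveAsSequenceFile("/tmp/seq_plain")                                 // uncompressed, as before
    data.saveAsSequenceFile("/tmp/seq_deflate", Some(classOf[DefaultCodec]))  // BLOCK-compressed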
@@ -6,6 +6,7 @@ import java.util.Comparator
 import scala.Tuple2
 import scala.collection.JavaConversions._
 
+import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.JobConf
 import org.apache.hadoop.mapred.OutputFormat
 import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat}
@@ -459,6 +460,16 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManifest[K],
     rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass)
   }
 
+  /** Output the RDD to any Hadoop-supported file system, compressing with the supplied codec. */
+  def saveAsHadoopFile[F <: OutputFormat[_, _]](
+    path: String,
+    keyClass: Class[_],
+    valueClass: Class[_],
+    outputFormatClass: Class[F],
+    codec: Class[_ <: CompressionCodec]) {
+    rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec)
+  }
+
   /** Output the RDD to any Hadoop-supported file system. */
   def saveAsNewAPIHadoopFile[F <: NewOutputFormat[_, _]](
     path: String,
@@ -4,6 +4,7 @@ import java.util.{List => JList}
 import scala.Tuple2
 import scala.collection.JavaConversions._
 
+import org.apache.hadoop.io.compress.CompressionCodec
 import spark.{SparkContext, Partition, RDD, TaskContext}
 import spark.api.java.JavaPairRDD._
 import spark.api.java.function.{Function2 => JFunction2, Function => JFunction, _}
@@ -310,6 +311,13 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
    */
  def saveAsTextFile(path: String) = rdd.saveAsTextFile(path)
 
+
+  /**
+   * Save this RDD as a compressed text file, using string representations of elements.
+   */
+  def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]) =
+    rdd.saveAsTextFile(path, codec)
+
   /**
    * Save this RDD as a SequenceFile of serialized objects.
    */
@@ -7,6 +7,8 @@ import scala.io.Source
 import com.google.common.io.Files
 import org.scalatest.FunSuite
 import org.apache.hadoop.io._
+import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodec, GzipCodec}
+
 
 import SparkContext._
 
@@ -26,6 +28,28 @@ class FileSuite extends FunSuite with LocalSparkContext {
     assert(sc.textFile(outputDir).collect().toList === List("1", "2", "3", "4"))
   }
 
+  test("text files (compressed)") {
+    sc = new SparkContext("local", "test")
+    val tempDir = Files.createTempDir()
+    val normalDir = new File(tempDir, "output_normal").getAbsolutePath
+    val compressedOutputDir = new File(tempDir, "output_compressed").getAbsolutePath
+    val codec = new DefaultCodec()
+
+    val data = sc.parallelize("a" * 10000, 1)
+    data.saveAsTextFile(normalDir)
+    data.saveAsTextFile(compressedOutputDir, classOf[DefaultCodec])
+
+    val normalFile = new File(normalDir, "part-00000")
+    val normalContent = sc.textFile(normalDir).collect
+    assert(normalContent === Array.fill(10000)("a"))
+
+    val compressedFile = new File(compressedOutputDir, "part-00000" + codec.getDefaultExtension)
+    val compressedContent = sc.textFile(compressedOutputDir).collect
+    assert(compressedContent === Array.fill(10000)("a"))
+
+    assert(compressedFile.length < normalFile.length)
+  }
+
   test("SequenceFiles") {
     sc = new SparkContext("local", "test")
     val tempDir = Files.createTempDir()
@@ -37,6 +61,28 @@ class FileSuite extends FunSuite with LocalSparkContext {
     assert(output.map(_.toString).collect().toList === List("(1,a)", "(2,aa)", "(3,aaa)"))
   }
 
+  test("SequenceFile (compressed)") {
+    sc = new SparkContext("local", "test")
+    val tempDir = Files.createTempDir()
+    val normalDir = new File(tempDir, "output_normal").getAbsolutePath
+    val compressedOutputDir = new File(tempDir, "output_compressed").getAbsolutePath
+    val codec = new DefaultCodec()
+
+    val data = sc.parallelize(Seq.fill(100)("abc"), 1).map(x => (x, x))
+    data.saveAsSequenceFile(normalDir)
+    data.saveAsSequenceFile(compressedOutputDir, Some(classOf[DefaultCodec]))
+
+    val normalFile = new File(normalDir, "part-00000")
+    val normalContent = sc.sequenceFile[String, String](normalDir).collect
+    assert(normalContent === Array.fill(100)("abc", "abc"))
+
+    val compressedFile = new File(compressedOutputDir, "part-00000" + codec.getDefaultExtension)
+    val compressedContent = sc.sequenceFile[String, String](compressedOutputDir).collect
+    assert(compressedContent === Array.fill(100)("abc", "abc"))
+
+    assert(compressedFile.length < normalFile.length)
+  }
+
   test("SequenceFile with writable key") {
     sc = new SparkContext("local", "test")
     val tempDir = Files.createTempDir()
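Both tests rely on Hadoop decompressing transparently on read: sc.textFile and sc.sequenceFile return the original records from the compressed directories with no codec argument. A condensed sketch of that round trip, not part of the suite, assuming a local SparkContext named sc:

    import com.google.common.io.Files
    import org.apache.hadoop.io.compress.GzipCodec

    // Hypothetical round trip: write gzip-compressed text, read it back unchanged.
    val dir = new java.io.File(Files.createTempDir(), "roundtrip").getAbsolutePath
    sc.parallelize(Seq("x", "y", "z")).saveAsTextFile(dir, classOf[GzipCodec])
    assert(sc.textFile(dir).collect().toSet == Set("x", "y", "z"))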
@@ -8,6 +8,7 @@ import java.util.*;
 import scala.Tuple2;
 
 import com.google.common.base.Charsets;
+import org.apache.hadoop.io.compress.DefaultCodec;
 import com.google.common.io.Files;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
@@ -473,6 +474,19 @@ public class JavaAPISuite implements Serializable {
     Assert.assertEquals(expected, readRDD.collect());
   }
 
+  @Test
+  public void textFilesCompressed() throws IOException {
+    File tempDir = Files.createTempDir();
+    String outputDir = new File(tempDir, "output").getAbsolutePath();
+    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
+    rdd.saveAsTextFile(outputDir, DefaultCodec.class);
+
+    // Try reading it in as a text file RDD
+    List<String> expected = Arrays.asList("1", "2", "3", "4");
+    JavaRDD<String> readRDD = sc.textFile(outputDir);
+    Assert.assertEquals(expected, readRDD.collect());
+  }
+
   @Test
   public void sequenceFile() {
     File tempDir = Files.createTempDir();
@@ -619,6 +633,37 @@ public class JavaAPISuite implements Serializable {
     }).collect().toString());
   }
 
+  @Test
+  public void hadoopFileCompressed() {
+    File tempDir = Files.createTempDir();
+    String outputDir = new File(tempDir, "output_compressed").getAbsolutePath();
+    List<Tuple2<Integer, String>> pairs = Arrays.asList(
+      new Tuple2<Integer, String>(1, "a"),
+      new Tuple2<Integer, String>(2, "aa"),
+      new Tuple2<Integer, String>(3, "aaa")
+    );
+    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
+
+    rdd.map(new PairFunction<Tuple2<Integer, String>, IntWritable, Text>() {
+      @Override
+      public Tuple2<IntWritable, Text> call(Tuple2<Integer, String> pair) {
+        return new Tuple2<IntWritable, Text>(new IntWritable(pair._1()), new Text(pair._2()));
+      }
+    }).saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class,
+      DefaultCodec.class);
+
+    JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir,
+      SequenceFileInputFormat.class, IntWritable.class, Text.class);
+
+    Assert.assertEquals(pairs.toString(), output.map(new Function<Tuple2<IntWritable, Text>,
+      String>() {
+      @Override
+      public String call(Tuple2<IntWritable, Text> x) {
+        return x.toString();
+      }
+    }).collect().toString());
+  }
+
   @Test
   public void zip() {
     JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));