[SPARK-14873][CORE] Java sampleByKey methods take ju.Map but with Scala Double values; results in type Object
## What changes were proposed in this pull request?

Java `sampleByKey` methods should accept `Map` with `java.lang.Double` values

## How was this patch tested?

Existing (updated) Jenkins tests

Author: Sean Owen <sowen@cloudera.com>

Closes #12637 from srowen/SPARK-14873.
This commit is contained in:
parent
a55fbe2a16
commit
be0d5d3bbe
|
@ -139,9 +139,12 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
|
|||
* math.ceil(numItems * samplingRate) over all key values.
|
||||
*/
|
||||
def sampleByKey(withReplacement: Boolean,
|
||||
fractions: java.util.Map[K, Double],
|
||||
fractions: java.util.Map[K, jl.Double],
|
||||
seed: Long): JavaPairRDD[K, V] =
|
||||
new JavaPairRDD[K, V](rdd.sampleByKey(withReplacement, fractions.asScala, seed))
|
||||
new JavaPairRDD[K, V](rdd.sampleByKey(
|
||||
withReplacement,
|
||||
fractions.asScala.mapValues(_.toDouble).toMap, // map to Scala Double; toMap to serialize
|
||||
seed))
|
||||
|
||||
/**
|
||||
* Return a subset of this RDD sampled by key (via stratified sampling).
|
||||
|
@ -154,7 +157,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
|
|||
* Use Utils.random.nextLong as the default seed for the random number generator.
|
||||
*/
|
||||
def sampleByKey(withReplacement: Boolean,
|
||||
fractions: java.util.Map[K, Double]): JavaPairRDD[K, V] =
|
||||
fractions: java.util.Map[K, jl.Double]): JavaPairRDD[K, V] =
|
||||
sampleByKey(withReplacement, fractions, Utils.random.nextLong)
|
||||
|
||||
/**
|
||||
|
@ -168,9 +171,12 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
|
|||
* two additional passes.
|
||||
*/
|
||||
def sampleByKeyExact(withReplacement: Boolean,
|
||||
fractions: java.util.Map[K, Double],
|
||||
fractions: java.util.Map[K, jl.Double],
|
||||
seed: Long): JavaPairRDD[K, V] =
|
||||
new JavaPairRDD[K, V](rdd.sampleByKeyExact(withReplacement, fractions.asScala, seed))
|
||||
new JavaPairRDD[K, V](rdd.sampleByKeyExact(
|
||||
withReplacement,
|
||||
fractions.asScala.mapValues(_.toDouble).toMap, // map to Scala Double; toMap to serialize
|
||||
seed))
|
||||
|
||||
/**
|
||||
* Return a subset of this RDD sampled by key (via stratified sampling) containing exactly
|
||||
|
@ -186,7 +192,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
|
|||
*/
|
||||
def sampleByKeyExact(
|
||||
withReplacement: Boolean,
|
||||
fractions: java.util.Map[K, Double]): JavaPairRDD[K, V] =
|
||||
fractions: java.util.Map[K, jl.Double]): JavaPairRDD[K, V] =
|
||||
sampleByKeyExact(withReplacement, fractions, Utils.random.nextLong)
|
||||
|
||||
/**
|
||||
|
|
|
@ -44,7 +44,6 @@ import com.google.common.collect.ImmutableMap;
|
|||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.base.Throwables;
|
||||
import com.google.common.io.Files;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
|
@ -1644,7 +1643,7 @@ public class JavaAPISuite implements Serializable {
|
|||
return new Tuple2<>(i % 2, 1);
|
||||
}
|
||||
});
|
||||
Map<Integer, Object> fractions = Maps.newHashMap();
|
||||
Map<Integer, Double> fractions = new HashMap<>();
|
||||
fractions.put(0, 0.5);
|
||||
fractions.put(1, 1.0);
|
||||
JavaPairRDD<Integer, Integer> wr = rdd2.sampleByKey(true, fractions, 1L);
|
||||
|
@ -1670,7 +1669,7 @@ public class JavaAPISuite implements Serializable {
|
|||
return new Tuple2<>(i % 2, 1);
|
||||
}
|
||||
});
|
||||
Map<Integer, Object> fractions = Maps.newHashMap();
|
||||
Map<Integer, Double> fractions = new HashMap<>();
|
||||
fractions.put(0, 0.5);
|
||||
fractions.put(1, 1.0);
|
||||
JavaPairRDD<Integer, Integer> wrExact = rdd2.sampleByKeyExact(true, fractions, 1L);
|
||||
|
|
|
@ -37,22 +37,19 @@ public class JavaStratifiedSamplingExample {
|
|||
|
||||
@SuppressWarnings("unchecked")
|
||||
// $example on$
|
||||
List<Tuple2<Integer, Character>> list = new ArrayList<>(
|
||||
Arrays.<Tuple2<Integer, Character>>asList(
|
||||
new Tuple2(1, 'a'),
|
||||
new Tuple2(1, 'b'),
|
||||
new Tuple2(2, 'c'),
|
||||
new Tuple2(2, 'd'),
|
||||
new Tuple2(2, 'e'),
|
||||
new Tuple2(3, 'f')
|
||||
)
|
||||
List<Tuple2<Integer, Character>> list = Arrays.asList(
|
||||
new Tuple2<>(1, 'a'),
|
||||
new Tuple2<>(1, 'b'),
|
||||
new Tuple2<>(2, 'c'),
|
||||
new Tuple2<>(2, 'd'),
|
||||
new Tuple2<>(2, 'e'),
|
||||
new Tuple2<>(3, 'f')
|
||||
);
|
||||
|
||||
JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
|
||||
|
||||
// specify the exact fraction desired from each key Map<K, Object>
|
||||
ImmutableMap<Integer, Object> fractions =
|
||||
ImmutableMap.of(1, (Object)0.1, 2, (Object) 0.6, 3, (Object) 0.3);
|
||||
// specify the exact fraction desired from each key Map<K, Double>
|
||||
ImmutableMap<Integer, Double> fractions = ImmutableMap.of(1, 0.1, 2, 0.6, 3, 0.3);
|
||||
|
||||
// Get an approximate sample from each stratum
|
||||
JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
|
||||
|
|
Loading…
Reference in a new issue