[SPARK-31518][CORE] Expose filterByRange in JavaPairRDD
### What changes were proposed in this pull request? This exposes the `filterByRange` method from `OrderedRDDFunctions` in the Java API (as a method of JavaPairRDD). This is the only method of `OrderedRDDFunctions` which is not exposed in the Java API so far. ### Why are the changes needed? This improves the consistency between the Scala and Java APIs. Calling the Scala method manually from a Java context is cumbersome as it requires passing many ClassTags. ### Does this PR introduce any user-facing change? Yes, a new method in the Java API. ### How was this patch tested? With unit tests. The implementation of the Scala method is already tested independently and it was not touched in this PR. Suggesting srowen as a reviewer. Closes #28293 from wetneb/SPARK-31518. Authored-by: Antonin Delpeuch <antonin@delpeuch.eu> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
parent
da3c6c4e35
commit
497024956a
|
@ -32,6 +32,7 @@ import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat}
|
|||
|
||||
import org.apache.spark.{HashPartitioner, Partitioner}
|
||||
import org.apache.spark.Partitioner._
|
||||
import org.apache.spark.annotation.Since
|
||||
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
|
||||
import org.apache.spark.api.java.JavaUtils.mapAsSerializableJavaMap
|
||||
import org.apache.spark.api.java.function.{FlatMapFunction, Function => JFunction,
|
||||
|
@ -937,6 +938,34 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
|
|||
fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).sortByKey(ascending, numPartitions))
|
||||
}
|
||||
|
||||
/**
 * Return an RDD containing only the elements in the inclusive range `lower` to `upper`.
 * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be
 * performed efficiently by only scanning the partitions that might contain matching elements.
 * Otherwise, a standard `filter` is applied to all partitions.
 *
 * Keys are compared with their natural ordering, so `K` is expected to be
 * `Comparable`; otherwise the cast below fails at runtime.
 *
 * @param lower lower bound of the key range (inclusive)
 * @param upper upper bound of the key range (inclusive)
 * @since 3.1.0
 */
@Since("3.1.0")
def filterByRange(lower: K, upper: K): JavaPairRDD[K, V] = {
  // Guava's natural ordering delegates to Comparable, giving Java callers a
  // default comparator without requiring an Ordering[K] ClassTag dance.
  val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]]
  filterByRange(comp, lower, upper)
}
|
||||
|
||||
/**
 * Return an RDD containing only the elements in the inclusive range `lower` to `upper`,
 * as determined by the given comparator.
 * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be
 * performed efficiently by only scanning the partitions that might contain matching elements.
 * Otherwise, a standard `filter` is applied to all partitions.
 *
 * @param comp  comparator defining the key order used for the range check
 * @param lower lower bound of the key range (inclusive)
 * @param upper upper bound of the key range (inclusive)
 * @since 3.1.0
 */
@Since("3.1.0")
def filterByRange(comp: Comparator[K], lower: K, upper: K): JavaPairRDD[K, V] = {
  // Making the Comparator implicit lets scala.math.Ordering's implicit
  // Comparator-to-Ordering conversion supply the Ordering[K] that
  // OrderedRDDFunctions requires.
  implicit val ordering: Comparator[K] = comp
  fromRDD(new OrderedRDDFunctions[K, V, (K, V)](rdd).filterByRange(lower, upper))
}
|
||||
|
||||
/**
|
||||
* Return an RDD with the keys of each tuple.
|
||||
*/
|
||||
|
|
|
@ -251,6 +251,34 @@ public class JavaAPISuite implements Serializable {
|
|||
Arrays.asList(new Tuple2<>(1, 3), new Tuple2<>(3, 8), new Tuple2<>(3, 8)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void filterByRange() {
|
||||
List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();
|
||||
pairs.add(new Tuple2<>(0, 5));
|
||||
pairs.add(new Tuple2<>(1, 8));
|
||||
pairs.add(new Tuple2<>(2, 6));
|
||||
pairs.add(new Tuple2<>(3, 8));
|
||||
pairs.add(new Tuple2<>(4, 8));
|
||||
|
||||
JavaPairRDD<Integer, Integer> rdd = sc.parallelizePairs(pairs).sortByKey();
|
||||
|
||||
// Default comparator
|
||||
JavaPairRDD<Integer, Integer> filteredRDD = rdd.filterByRange(3, 11);
|
||||
List<Tuple2<Integer, Integer>> filteredPairs = filteredRDD.collect();
|
||||
assertEquals(filteredPairs.size(), 2);
|
||||
assertEquals(filteredPairs.get(0), new Tuple2<>(3, 8));
|
||||
assertEquals(filteredPairs.get(1), new Tuple2<>(4, 8));
|
||||
|
||||
// Custom comparator
|
||||
filteredRDD = rdd.filterByRange(Collections.reverseOrder(), 3, -2);
|
||||
filteredPairs = filteredRDD.collect();
|
||||
assertEquals(filteredPairs.size(), 4);
|
||||
assertEquals(filteredPairs.get(0), new Tuple2<>(0, 5));
|
||||
assertEquals(filteredPairs.get(1), new Tuple2<>(1, 8));
|
||||
assertEquals(filteredPairs.get(2), new Tuple2<>(2, 6));
|
||||
assertEquals(filteredPairs.get(3), new Tuple2<>(3, 8));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyRDD() {
|
||||
JavaRDD<String> rdd = sc.emptyRDD();
|
||||
|
|
Loading…
Reference in a new issue