1c690ddafa
This PR adds an initial implementation of count min sketch, contained in a new module spark-sketch under `common/sketch`. The implementation is based on the [`CountMinSketch` class in stream-lib][1].
As required by the [design doc][2], spark-sketch should have no external dependency.
Two classes, `Murmur3_x86_32` and `Platform` are copied to spark-sketch from spark-unsafe for hashing facilities. They'll also be used in the upcoming bloom filter implementation.
The following features will be added in future follow-up PRs:
- Serialization support
- DataFrame API integration
[1]: https://github.com/addthis/stream-lib/blob/aac6b4d23a/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
[2]: https://issues.apache.org/jira/secure/attachment/12782378/BloomFilterandCount-MinSketchinSpark2.0.pdf
Author: Cheng Lian <lian@databricks.com>
Closes #10851 from liancheng/count-min-sketch.
43 lines
1.7 KiB
XML
43 lines
1.7 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements. See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License. You may obtain a copy of the License at
  ~
  ~    http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Inherit shared build configuration from the root Spark POM, two directories up. -->
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.10</artifactId>
    <version>2.0.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>

  <!--
    Coordinates of the sketch module. No <dependencies> section is declared
    in this POM; keep it that way unless the "no external dependency"
    requirement for this module is relaxed.
  -->
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sketch_2.10</artifactId>
  <packaging>jar</packaging>
  <name>Spark Project Sketch</name>
  <url>http://spark.apache.org/</url>
  <properties>
    <!-- NOTE(review): presumably the name the SBT build uses for this module; confirm against the SBT build definition. -->
    <sbt.project.name>sketch</sbt.project.name>
  </properties>

  <build>
    <!--
      Compiled main and test classes are written under a directory named after
      the Scala binary version. scala.binary.version is not defined in this
      file; presumably it is inherited from the parent POM (TODO confirm).
    -->
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
  </build>
</project>