131 lines
3.8 KiB
Scala
131 lines
3.8 KiB
Scala
package org.mimirdb.pip.lib
|
|
|
|
import org.scalatest.flatspec.AnyFlatSpec
|
|
import TestData._
|
|
import org.mimirdb.pip.distribution.Discretized
|
|
import org.mimirdb.pip.Test
|
|
|
|
/**
 * Tests for the clustering strategies offered by `HierarchicalClustering`.
 *
 * The suite checks that group radii do not grow (beyond a tolerance) while
 * descending a cluster tree, logs cluster radius against KL divergence, and
 * records how long each strategy takes on datasets of increasing size.
 */
class HierarchicalClusteringTests extends AnyFlatSpec {

  /**
   * Recursively assert that each group's radius is at most 1.2x the radius
   * of the group that contains it.
   *
   * @param cluster      the (sub)tree to check
   * @param parentRadius radius of the enclosing group; the tests below pass
   *                     `Double.PositiveInfinity` for the root
   */
  def testRadii(cluster: HierarchicalClustering.Cluster[Double, Int], parentRadius: Double): Unit =
    cluster match {
      case _: HierarchicalClustering.Singleton[Double, Int] => ()
      case g: HierarchicalClustering.Group[Double, Int] =>
        // Adopt an approximation for now: allow 20% slack instead of
        // requiring the child radius to be strictly bounded by the parent.
        assert(g.radius <= parentRadius * 1.2)
        g.children.foreach { testRadii(_, g.radius) }
    }

  /** The clustering strategies under test, keyed by the label used in log names. */
  val Strategies =
    Seq[(String, (IndexedSeq[(Array[Double], Int)], Distance[Double]) => HierarchicalClustering.Cluster[Double, Int])](
      "Naive" -> { HierarchicalClustering.naive(_, _) },
      "BottomUp" -> { HierarchicalClustering.bottomUp(_, _) },
    )

  "Clustering" should "be correct" in {
    for ((strategy, makeClusters) <- Strategies) {
      // The loop variable must start with a lower-case letter: an upper-case
      // simple name in a generator is a stable identifier pattern, which
      // refers to an existing value rather than binding each element.
      for (measure <- Seq(ManhattanDistance, EuclideanDistance)) {
        // Drops the last 9 characters of the class name
        // (presumably the "Distance$" suffix of the object's class -- confirm).
        val measureName = measure.getClass.getSimpleName.dropRight(9)

        val clusters =
          Time(s"$strategy-Cluster-$measureName") {
            makeClusters(
              TEST_POINTS.toIndexedSeq.zipWithIndex,
              measure
            )
          }

        testRadii(clusters, Double.PositiveInfinity)

        // One row per cluster with a non-zero radius, over the full ordering.
        Test.log(s"$strategy-Cluster-$measureName-Full") { log =>
          log(s"cluster_radius, kl_divergence")
          for (c <- clusters.orderedIterator) {
            if (c.radius > 0) {
              val centroid =
                positionToBins(
                  measure.centroid(c.elements.map { _.position }.toSeq)
                )
              // Worst-case divergence of any member from the cluster centroid.
              val divergence =
                c.elements.map { e =>
                  val bins = positionToBins(e.position)
                  Discretized.klDivergence(bins, centroid)
                }.max
              log(s"${c.radius}, $divergence")
            }
          }
        }

        // Same report, restricted to the clusters returned by threshold(0.3).
        Test.log(s"$strategy-Cluster-$measureName-Cutoff_0.3") { log =>
          log(s"cluster_radius, kl_divergence")
          for (c <- clusters.threshold(0.3)) {
            if (c.radius > 0) {
              val centroid =
                positionToBins(
                  measure.centroid(c.elements.map { _.position }.toSeq)
                )
              // Worst-case divergence of any member from the cluster centroid.
              val divergence =
                c.elements.map { e =>
                  val bins = positionToBins(e.position)
                  Discretized.klDivergence(bins, centroid)
                }.max
              log(s"${c.radius}, $divergence")
            }
          }
        }
      }
    }
  }

  // Only records the timing of a 500-point run; no time limit is asserted.
  it should "run fast" in {
    val data = TestData.makeData(500).zipWithIndex
    Time("Hierarchical@500") {
      HierarchicalClustering.bottomUp(data, EuclideanDistance)
    }
  }

  it should "perform sensibly" in {
    // BURN IN: run every strategy once before the timed runs below.
    val data = TestData.makeData(200).zipWithIndex
    for ((_, makeClusters) <- Strategies) {
      makeClusters(data, EuclideanDistance)
    }

    // All datasets are generated up front, before any timing starts.
    val datasets = Seq(
      100,
      250,
      500,
      1000,
      2500,
      5000,
      10000,
      // 25000,
      // 50000
    ).map { s =>
      s -> TestData.makeData(s).zipWithIndex
    }

    for ((strategy, makeClusters) <- Strategies) {
      Test.log(s"$strategy-Time") { log =>
        log("size, time_s")
        for ((size, data) <- datasets) {
          // The naive strategy is not timed on datasets of 1000+ points.
          val skip = (strategy == "Naive" && size >= 1000)
          if (!skip) {
            Time(t => { log(s"$size, $t"); println(s"$strategy @ $size: $t s") }) {
              makeClusters(data, EuclideanDistance)
            }
          }
        }
      }
    }
  }

}