// mimir-pip/lib/test/src/org/mimirdb/pip/lib/HierarchicalClusteringTests...
//
// 131 lines
// 3.8 KiB
// Scala

package org.mimirdb.pip.lib
import org.scalatest.flatspec.AnyFlatSpec
import TestData._
import org.mimirdb.pip.distribution.Discretized
import org.mimirdb.pip.Test
/**
 * Tests for [[HierarchicalClustering]].
 *
 * The suite checks the parent/child radius relationship of the generated
 * cluster trees, logs each cluster's radius next to the worst-case
 * KL-divergence of its members from the cluster centroid, and records how
 * long each clustering strategy takes on inputs of increasing size.
 */
class HierarchicalClusteringTests extends AnyFlatSpec {

  /**
   * Recursively assert that every group's radius is at most 1.2 times the
   * radius of its parent group. Singletons are accepted unconditionally.
   *
   * @param cluster       the (sub)tree to check
   * @param parentRadius  radius of the enclosing group; callers pass
   *                      `Double.PositiveInfinity` for the root
   */
  def testRadii(cluster: HierarchicalClustering.Cluster[Double, Int], parentRadius: Double): Unit =
    cluster match {
      case _: HierarchicalClustering.Singleton[Double, Int] => ()
      case g: HierarchicalClustering.Group[Double, Int] =>
        // Adopt an approximation for now
        assert(g.radius <= parentRadius * 1.2)
        g.children.foreach { testRadii(_, g.radius) }
    }

  /** The clustering implementations under test, paired with a label used in log and timer names. */
  val Strategies =
    Seq[(String, (IndexedSeq[(Array[Double], Int)], Distance[Double]) => HierarchicalClustering.Cluster[Double, Int])](
      "Naive" -> { HierarchicalClustering.naive(_, _) },
      "BottomUp" -> { HierarchicalClustering.bottomUp(_, _) },
    )

  "Clustering" should "be correct" in {
    for ((strategy, makeClusters) <- Strategies) {
      // The loop variable is lowercase on purpose: an uppercase simple name in
      // generator position is a stable-identifier pattern, not a new binding.
      for (measure <- Seq(ManhattanDistance, EuclideanDistance)) {
        // Drops the last 9 characters of the class's simple name; presumably
        // this strips a "Distance$" suffix (e.g. "ManhattanDistance$" ->
        // "Manhattan") -- confirm if the distance objects are ever renamed.
        val test = measure.getClass.getSimpleName.dropRight(9)

        val clusters =
          Time(s"$strategy-Cluster-$test") {
            makeClusters(
              TEST_POINTS.toIndexedSeq.zipWithIndex,
              measure
            )
          }

        testRadii(clusters, Double.PositiveInfinity)

        // One row per cluster with a positive radius, over the full ordered iteration.
        Test.log(s"$strategy-Cluster-$test-Full") { log =>
          log(s"cluster_radius, kl_divergence")
          for (c <- clusters.orderedIterator) {
            if (c.radius > 0) {
              val centroid =
                positionToBins(
                  measure.centroid(c.elements.map { _.position }.toSeq)
                )
              // Largest divergence of any member from the centroid.
              val divergence =
                c.elements.map { e =>
                  val bins = positionToBins(e.position)
                  Discretized.klDivergence(bins, centroid)
                }.max
              log(s"${c.radius}, $divergence")
            }
          }
        }

        // Same report, restricted to the clusters returned by threshold(0.3).
        Test.log(s"$strategy-Cluster-$test-Cutoff_0.3") { log =>
          log(s"cluster_radius, kl_divergence")
          for (c <- clusters.threshold(0.3)) {
            if (c.radius > 0) {
              val centroid =
                positionToBins(
                  measure.centroid(c.elements.map { _.position }.toSeq)
                )
              // Largest divergence of any member from the centroid.
              val divergence =
                c.elements.map { e =>
                  val bins = positionToBins(e.position)
                  Discretized.klDivergence(bins, centroid)
                }.max
              log(s"${c.radius}, $divergence")
            }
          }
        }
      }
    }
  }

  it should "run fast" in {
    val data = TestData.makeData(500).zipWithIndex
    Time("Hierarchical@500") {
      HierarchicalClustering.bottomUp(data, EuclideanDistance)
    }
  }

  it should "perform sensibly" in {
    // BURN IN
    val data = TestData.makeData(200).zipWithIndex
    for ((strategy, makeClusters) <- Strategies) {
      makeClusters(data, EuclideanDistance)
    }

    // Input sizes to time, each paired with its generated data set.
    val datasets = Seq(
      100,
      250,
      500,
      1000,
      2500,
      5000,
      10000,
      // 25000,
      // 50000
    ).map { s =>
      s -> TestData.makeData(s).zipWithIndex
    }

    for ((strategy, makeClusters) <- Strategies) {
      Test.log(s"$strategy-Time") { log =>
        log("size, time_s")
        for ((size, data) <- datasets) {
          // The "Naive" strategy is not run on inputs of 1000 points or more.
          val skip = (strategy == "Naive" && size >= 1000)
          if (!skip) {
            Time(t => { log(s"$size, $t"); println(s"$strategy @ $size: $t s") }) {
              makeClusters(data, EuclideanDistance)
            }
          }
        }
      }
    }
  }
}