mimir-pip/lib/src/org/mimirdb/pip/lib/HierarchicalClustering.scala

129 lines
3.6 KiB
Scala

package org.mimirdb.pip.lib
import scala.collection.mutable
import scala.collection.mutable.PriorityQueue
object HierarchicalClustering
{
/**
 * Build a binary cluster hierarchy over `elements` by recursive top-down
 * splitting.
 *
 * At each level, the element with the largest total distance to all elements
 * seeds a new "right" partition; everything else starts in the "left"
 * partition.  Elements are then moved one at a time from left to right for as
 * long as some element is, on average, further from the rest of the left
 * partition than it is from the right partition.  Both partitions are then
 * clustered recursively.
 *
 * @param elements  The (position, value) pairs to cluster.  Must be non-empty.
 * @param measure   The distance measure used to compare two positions.
 * @return          A [[Singleton]] when there is exactly one element;
 *                  otherwise a [[Group]] with two children, whose radius is
 *                  the seed element's total distance to all elements and
 *                  whose size is `elements.size`.
 */
def naive[I, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I]): Cluster[I, V] =
{
  // Total distance from element `i` to every element indexed by `cmp`.
  //
  // The sum deliberately runs over an iterator: `cmp` is a Set when called
  // from the splitting loop below, and mapping a Set directly builds another
  // Set, which would silently merge equal distances and under-count the total.
  def distance(i: Int, cmp: Iterable[Int]): Double =
    cmp.iterator.map { j =>
      if(i == j){ 0.0 }
      else { measure.pointToPoint(elements(i)._1, elements(j)._1) }
    }.sum

  assert(!elements.isEmpty)

  if(elements.size == 1){
    Singleton(elements.head._1, elements.head._2)
  } else {
    val all = (0 until elements.size)

    // The seed of the new partition: the element with the largest total
    // distance to all elements.
    val (furthestIdx, furthestDistance) =
      all.map { i => i -> distance(i, all) }
         .maxBy { _._2 }

    val left =
      mutable.Set(all.filterNot { _ == furthestIdx }:_*)
    val right =
      mutable.Set[Int](furthestIdx)

    // `left` always keeps at least one element, so both recursive calls below
    // receive a non-empty input that is strictly smaller than `elements`.
    var done = false
    while(left.size > 1 && !done){
      val (bestMoveCandidate, bestMoveScore) =
        left.toSeq
            .map { i =>
              // Average distance to the *other* members of left; i's own
              // contribution is 0.0, hence the (size - 1) denominator.
              val leftScore =
                (1.0 / (left.size - 1)) * distance(i, left)
              // Average distance to the members of right.
              val rightScore =
                (1.0 / right.size) * distance(i, right)
              i -> (leftScore - rightScore)
            }
            .maxBy { _._2 }

      // Stop as soon as no element is closer (on average) to right than to
      // the rest of left.
      if(bestMoveScore <= 0) { done = true }
      else {
        left -= bestMoveCandidate
        right += bestMoveCandidate
      }
    }

    Group(
      Array[Cluster[I, V]](
        naive(left.toIndexedSeq.map { elements(_) }, measure),
        naive(right.toIndexedSeq.map { elements(_) }, measure)
      ),
      radius = furthestDistance,
      size = elements.size
    )
  }
}
/**
 * Walks a cluster hierarchy starting at `root`.
 *
 * Pending clusters are held in a priority queue keyed on radius; each call to
 * `next()` removes the pending cluster with the largest radius and schedules
 * that cluster's children for later visits.
 *
 * @param root  The cluster the traversal starts from.
 */
class ClusterIterator[I, V](root: Cluster[I, V])
  extends Iterator[Cluster[I, V]]
{
  // Pending clusters; the one with the largest radius is dequeued first.
  val queue = PriorityQueue(root)(new Ordering[Cluster[I, V]]{
    def compare(a: Cluster[I, V], b: Cluster[I, V]) =
      java.lang.Double.compare(a.radius, b.radius)
  })

  /** True while there are still clusters waiting to be visited. */
  def hasNext: Boolean = queue.nonEmpty

  /** Remove and return the pending cluster with the largest radius. */
  def next(): Cluster[I, V] =
  {
    val widest = queue.dequeue()
    for(child <- widest.children){ queue.enqueue(child) }
    widest
  }
}
/**
 * A node in a cluster hierarchy.  Implemented in this file by [[Group]]
 * (a node with child clusters) and [[Singleton]] (a single element).
 */
trait Cluster[I, V]
{
  /** The radius assigned to this cluster (0.0 for a [[Singleton]]). */
  def radius: Double

  /**
   * Render this cluster (and anything below it) as text.
   *
   * @param firstPrefix  Text placed in front of the first line of output.
   * @param restPrefix   Text from which the prefixes of any further lines
   *                     are built.
   */
  def render(firstPrefix: String, restPrefix: String): String

  /** The size recorded for this cluster (1 for a [[Singleton]]). */
  def size: Int

  /** The clusters directly below this one; empty for a [[Singleton]]. */
  def children: Array[Cluster[I, V]]

  /** A [[ClusterIterator]] over the hierarchy rooted at this cluster. */
  def orderedIterator: ClusterIterator[I, V] = new ClusterIterator(this)

  /** Every [[Singleton]] at or below this cluster. */
  def elements: Iterator[Singleton[I, V]]

  /**
   * Cut the hierarchy at the given radius.
   *
   * A cluster whose radius exceeds `cutoff` is replaced by the result of
   * cutting each of its children.  A cluster without children is always
   * returned as-is, even if its radius exceeds the cutoff, so that no
   * elements are dropped from the result (e.g., for a negative cutoff).
   *
   * @param cutoff  The largest radius a returned cluster with children may have.
   * @return        The clusters that remain after the cut.
   */
  def threshold(cutoff: Double): Iterator[Cluster[I, V]] =
  {
    val below = children
    if(radius > cutoff && below.nonEmpty){
      below.iterator.flatMap { _.threshold(cutoff) }
    } else {
      Iterator.single(this)
    }
  }

  override def toString(): String = render("", "")
}
/**
 * A cluster made up of child clusters.
 *
 * @param children  The clusters directly below this one.
 * @param radius    The radius assigned to this cluster.
 * @param size      The size recorded for this cluster.
 */
case class Group[I, V](children: Array[Cluster[I,V]], radius: Double, size: Int) extends Cluster[I, V]
{
  /**
   * Render a header line showing the radius, followed by each child rendered
   * underneath it on its own line(s).
   */
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val lastIdx = children.size - 1
    // Every child's first line starts with the same branch marker.
    val branchPrefix = restPrefix + " +-"

    val renderedChildren =
      children.indices.map { idx =>
        // The last child has no siblings below it, so its continuation lines
        // don't need the vertical connector.
        val continuationPrefix =
          if(idx == lastIdx){
            restPrefix + " "
          } else {
            restPrefix + " | "
          }
        children(idx).render(branchPrefix, continuationPrefix)
      }

    firstPrefix + s"- [$radius]\n" + renderedChildren.mkString("\n")
  }

  /** The singletons of every child, in child order. */
  def elements: Iterator[Singleton[I, V]] =
    for(child <- children.iterator; leaf <- child.elements) yield leaf
}
/**
 * A cluster holding exactly one element.
 *
 * @param position  The element's position.
 * @param value     The value associated with the element.
 */
case class Singleton[I, V](position: Array[I], value: V) extends Cluster[I, V]
{
  def radius: Double = 0.0

  def size: Int = 1

  /** A singleton has nothing below it. */
  def children: Array[Cluster[I, V]] = Array.empty

  /** Render the position and value on a single line; `restPrefix` is unused. */
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val coordinates = position.mkString(", ")
    s"$firstPrefix <$coordinates> -> $value"
  }

  /** An iterator yielding only this singleton. */
  def elements: Iterator[Singleton[I, V]] =
    Iterator.single(this)
}
}