// Source: mimir-pip/lib/src/org/mimirdb/pip/lib/HierarchicalClustering.scala
// Listing metadata: 239 lines, 7.1 KiB, Scala

package org.mimirdb.pip.lib
import scala.collection.mutable
import scala.reflect.ClassTag
import scala.collection.mutable.PriorityQueue
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.Stack
object HierarchicalClustering
{
/**
 * Top-down (divisive) hierarchical clustering.
 *
 * The element with the largest total distance to all elements seeds a "right"
 * splinter group.  Elements are then moved from "left" to "right" one at a
 * time, for as long as some element is on average further from the rest of
 * "left" than it is from "right".  Both halves are then clustered recursively.
 *
 * @param elements  The (position, value) pairs to cluster; must be non-empty.
 * @param measure   Supplies the point-to-point distance between two positions.
 * @return          A [[Singleton]] when there is exactly one element, otherwise
 *                  a [[Group]] whose radius is the seed element's total
 *                  distance to all elements.
 */
def naive[I, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I]): Cluster[I, V] =
{
  // Total distance from element `i` to every element indexed by `cmp`.
  // The sum is taken over an iterator on purpose: `cmp` is a Set at several
  // call sites, and mapping a Set produces a Set, which would silently
  // collapse equal distances into a single term before summing.
  def distance(i: Int, cmp: Iterable[Int]): Double =
    cmp.iterator.map { j =>
      if(i == j){ 0.0 }
      else { measure.pointToPoint(elements(i)._1, elements(j)._1) }
    }.sum

  assert(!elements.isEmpty)
  if(elements.size == 1){
    Singleton(elements.head._1, elements.head._2)
  } else {
    val all = (0 until elements.size)

    // Seed the splinter group with the element that is furthest from everything
    val (furthestIdx, furthestDistance) =
      all.map { i => i -> distance(i, all) }
         .maxBy { _._2 }

    val left =
      mutable.Set(all.filterNot { _ == furthestIdx }:_*)
    val right =
      mutable.Set[Int](furthestIdx)

    // Never empty `left`: stop when one element remains or no move helps
    var done = false
    while(left.size > 1 && !done){
      val (bestMoveCandidate, bestMoveScore) =
        left.toSeq
            .map { i =>
              // `i` itself contributes 0.0, hence the average over size - 1
              val leftScore =
                (1.0 / (left.size - 1)) * distance(i, left)
              val rightScore =
                (1.0 / right.size) * distance(i, right)
              i -> (leftScore - rightScore)
            }
            .maxBy { _._2 }

      if(bestMoveScore <= 0) { done = true }
      else {
        left -= bestMoveCandidate
        right += bestMoveCandidate
      }
    }

    Group(
      left = naive(left.toIndexedSeq.map { elements(_) }, measure),
      right = naive(right.toIndexedSeq.map { elements(_) }, measure),
      radius = furthestDistance,
      size = elements.size
    )
  }
}
/**
 * Bottom-up (agglomerative) hierarchical clustering.
 *
 * Every element starts as a [[Singleton]].  A priority queue holds candidate
 * (distance, cluster index, neighbor index) merges, smallest distance first.
 * When both clusters of a dequeued candidate still exist they are merged into
 * a [[Group]] positioned at `measure.centroid` of its member positions, with
 * radius equal to the largest distance from that centroid to any member.
 * Candidates that mention an already-merged cluster are discarded.
 *
 * @param elements  The (position, value) pairs to cluster; at least two.
 * @param measure   Supplies point-to-point distances and centroids.
 * @return          The root of the resulting cluster hierarchy.
 */
def bottomUp[I: Ordering, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I])(implicit tag: ClassTag[I]): Cluster[I, V] =
{
  assert(elements.size > 1)

  val tree = new KDTree[I, Int](elements(0)._1.size)

  // higher values get dequeued first--max heap
  val queue = mutable.PriorityQueue()(new Ordering[(Double, Int, Int)]{
    def compare(a: (Double, Int, Int), b: (Double, Int, Int)): Int =
    {
      // invert to get lower distances first
      -Ordering[Double].compare(a._1, b._1)
    }
  })

  // Slot i holds the (position, cluster) registered under index i, or None
  // once that cluster has been merged into a larger one.
  val clusters: mutable.ArrayBuffer[Option[(Array[I], Cluster[I, V])]] =
    mutable.ArrayBuffer(
      elements.map { e => Some(e._1 -> Singleton(e._1, e._2)) }:_*
    )

  tree.insertAll(elements.map { _._1 }.zipWithIndex)

  // Seed the queue with each element's nearest neighbor other than itself
  for( (point, idx) <- elements.map { _._1 }.zipWithIndex )
  {
    // NOTE(review): `others` looks like the indices stored at a tree entry;
    // entries holding nothing but `idx` are ignored -- confirm against KDTree.
    tree.nearest(point, measure, ignore = { (_, others, _) => !others.exists { _ != idx } }) match {
      case Some( (nearestPoint, nearestIdxs, distance) ) =>
        queue.enqueue( (distance, idx, nearestIdxs.filter { _ != idx }.head) )
      case None =>
        assert(false)
    }
  }

  // Each iteration removes one item from the pqueue and a merge adds one.
  // Hard bound the number of iterations defensively.
  var result: Option[Cluster[I, V]] = None
  var stepsLeft = elements.size * 4
  while(result.isEmpty && stepsLeft > 0)
  {
    stepsLeft -= 1
    val (_, aIdx, bIdx) = queue.dequeue()
    (clusters(aIdx), clusters(bIdx)) match {
      case (Some((aPos, aCluster)), Some((bPos, bCluster))) =>
      {
        // Collect the member positions once; used for centroid and radius
        val memberPositions =
          (aCluster.elements ++ bCluster.elements).map { _.position }.toSeq
        val centroid = measure.centroid(memberPositions)
        val radius =
          memberPositions.map { measure.pointToPoint(centroid, _) }.max

        tree.remove(aPos, aIdx)
        tree.remove(bPos, bIdx)
        clusters(aIdx) = None
        clusters(bIdx) = None

        val newCluster =
          Group(
            left = aCluster,
            right = bCluster,
            radius = radius,
            size = aCluster.size + bCluster.size
          )

        if(tree.isEmpty){
          // Nothing left to merge with: this group is the root
          result = Some(newCluster)
        } else {
          val newIdx = clusters.size
          clusters.append( Some((centroid, newCluster)) )
          // Look up the neighbor before inserting the centroid, so the new
          // cluster cannot be reported as its own nearest neighbor.  The
          // tree is non-empty here, so a neighbor is expected to exist.
          val (nearestPosition, nearestIdx, nearestDistance) =
            tree.nearest(centroid, measure).get
          tree.insert(centroid, newIdx)
          queue.enqueue(
            (nearestDistance, newIdx, nearestIdx.head)
          )
        }
      }
      case _ =>
      {
        () // if a or b were removed, we're done with them
      }
    }
  }

  result.getOrElse {
    throw new IllegalStateException(
      "bottomUp did not converge within the defensive iteration bound"
    )
  }
}
/**
 * Walks a cluster hierarchy, always emitting the pending cluster with the
 * largest radius next.  Emitting a cluster makes its children pending.
 *
 * @param root  The cluster at which iteration starts.
 */
class ClusterIterator[I, V](root: Cluster[I, V])
  extends Iterator[Cluster[I, V]]
{
  // Max-heap keyed on radius: the widest pending cluster is dequeued first
  val queue = PriorityQueue(root)(new Ordering[Cluster[I, V]]{
    def compare(x: Cluster[I, V], y: Cluster[I, V]) =
      Ordering[Double].compare(x.radius, y.radius)
  })

  def hasNext: Boolean = queue.nonEmpty

  def next(): Cluster[I, V] =
  {
    val widest = queue.dequeue()
    // Enqueue one child at a time, in child order
    for(child <- widest.children){ queue.enqueue(child) }
    widest
  }
}
/**
 * Iterates over the [[Singleton]] leaves of a cluster hierarchy, left to
 * right, using an explicit stack of the groups whose right subtree has not
 * been visited yet.
 *
 * @param root  The cluster whose leaves are enumerated.
 */
class ClusterElementIterator[I, V](root: Cluster[I, V])
  extends Iterator[Singleton[I, V]]
{
  // Groups entered through their left child; their right child is pending
  val stack = Stack[Group[I, V]]()
  // The leaf that the next call to `next` returns; null once exhausted
  var nextSingleton: Singleton[I, V] = null

  pushLeft(root)

  /**
   * Descend from `node` along left children, stacking every group passed on
   * the way, and record the leaf reached as the next element.  A null node
   * is a no-op.
   */
  def pushLeft(node: Cluster[I, V]): Unit =
  {
    var cursor = node
    while(cursor != null){
      cursor match {
        case g: Group[I, V] =>
        {
          stack.push(g)
          cursor = g.left
        }
        case s: Singleton[I, V] =>
        {
          nextSingleton = s
          cursor = null
        }
      }
    }
  }

  def hasNext: Boolean = (nextSingleton != null)

  /**
   * Return the current leaf and advance to the following one.
   *
   * @throws NoSuchElementException if the iterator is exhausted
   */
  def next: Singleton[I, V] =
  {
    // Fail loudly instead of handing a null element to the caller
    if(nextSingleton == null){
      throw new NoSuchElementException("next on exhausted ClusterElementIterator")
    }
    val ret = nextSingleton
    if(stack.isEmpty){ nextSingleton = null }
    else { pushLeft(stack.pop().right) }
    ret
  }
}
/**
 * A node in a cluster hierarchy over positions of type `Array[I]` carrying
 * values of type `V`.
 */
trait Cluster[I, V]
{
  /** The radius recorded for this cluster */
  def radius: Double

  /** The number of elements recorded for this cluster */
  def size: Int

  /** The direct sub-clusters of this cluster */
  def children: Seq[Cluster[I, V]]

  /**
   * Render this cluster as text.
   *
   * @param firstPrefix  Prepended to the first rendered line
   * @param restPrefix   Base prefix for the lines rendered for sub-clusters
   */
  def render(firstPrefix: String, restPrefix: String): String

  /** Iterate over this cluster and its descendants, largest radius first */
  def orderedIterator: ClusterIterator[I, V] =
    new ClusterIterator(this)

  /** Iterate over the singleton leaves beneath this cluster */
  def elements: ClusterElementIterator[I, V] =
    new ClusterElementIterator[I, V](this)

  /**
   * Cut the hierarchy at `cutoff`.
   *
   * A cluster whose radius exceeds `cutoff` is replaced by whatever its
   * children yield (nothing, if it has no children); any other cluster is
   * yielded as is.
   */
  def threshold(cutoff: Double): Iterator[Cluster[I, V]] =
    if(radius > cutoff){
      children.iterator.flatMap { child => child.threshold(cutoff) }
    } else {
      Iterator.single(this)
    }

  override def toString(): String = render("", "")
}
/**
 * An inner node of the cluster hierarchy, made of two sub-clusters.
 *
 * @param left    The first sub-cluster
 * @param right   The second sub-cluster
 * @param radius  The radius recorded for this group
 * @param size    The number of elements recorded for this group
 */
case class Group[I, V](left: Cluster[I,V], right: Cluster[I, V], radius: Double, size: Int) extends Cluster[I, V]
{
  def children: Seq[Cluster[I, V]] = Seq(left, right)

  /**
   * Render this group as a header line followed by both sub-clusters.
   *
   * Lines below the left child continue with " | ", keeping the rail that
   * leads down to the right child.  Lines below the right child continue
   * with blanks of the same width, so that nested levels line up no matter
   * which side they hang from.
   */
  def render(firstPrefix: String, restPrefix: String): String =
    Seq(
      s"${firstPrefix}- [${radius}]",
      left.render(restPrefix + " +-", restPrefix + " | "),
      right.render(restPrefix + " +-", restPrefix + "   ")
    ).mkString("\n")
}
/**
 * A leaf of the cluster hierarchy: a single positioned value.
 *
 * @param position  The element's coordinates
 * @param value     The payload attached to the element
 */
case class Singleton[I, V](position: Array[I], value: V) extends Cluster[I, V]
{
  // A leaf has no extent, exactly one element, and no sub-clusters
  def radius = 0.0
  def size = 1
  def children = Seq.empty

  /** Render as a single line; `restPrefix` is unused because a leaf has no further lines */
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val coordinates = position.mkString(", ")
    s"${firstPrefix} <${coordinates}> -> ${value}"
  }
}
}