239 lines
7.1 KiB
Scala
239 lines
7.1 KiB
Scala
package org.mimirdb.pip.lib
|
|
|
|
import scala.collection.mutable
|
|
import scala.reflect.ClassTag
|
|
import scala.collection.mutable.PriorityQueue
|
|
import scala.collection.mutable.ArrayBuffer
|
|
import scala.collection.mutable.Stack
|
|
|
|
object HierarchicalClustering
|
|
{
|
|
// Top-down (divisive) clustering.
//
// The element with the largest total distance to all elements seeds the
// `right` side; every other element starts on the `left`.  Elements are then
// moved from left to right, one at a time, for as long as the best candidate
// is on average further from the rest of `left` than from `right`.  Both
// sides are clustered recursively.
//
// elements: the (position, value) pairs to cluster; must not be empty.
// measure:  supplies the point-to-point distance between two positions.
// returns:  a Singleton for a one-element input, otherwise a Group whose
//           radius is the seed element's total distance to all elements.
def naive[I, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I]): Cluster[I, V] =
{
  // Total distance from element `i` to every element indexed by `cmp`; an
  // element contributes 0.0 against itself.
  //
  // The sum deliberately runs over an iterator: `cmp` is a Set for the
  // left/right partitions, and mapping a Set produces another Set, which
  // would merge equal distances into one before they are summed.
  def distance(i: Int, cmp: Iterable[Int]): Double =
    cmp.iterator.map { j =>
      if(i == j){ 0.0 }
      else { measure.pointToPoint(elements(i)._1, elements(j)._1) }
    }.sum

  assert(!elements.isEmpty)
  if(elements.size == 1){
    val (position, value) = elements.head
    Singleton(position, value)
  } else {
    val all = elements.indices

    // Seed of the split: the element with the largest total distance.
    val (furthestIdx, furthestDistance) =
      all
        .map { i => i -> distance(i, all) }
        .maxBy { _._2 }

    val left =
      mutable.Set(all.filterNot { _ == furthestIdx }:_*)
    val right =
      mutable.Set[Int](furthestIdx)

    // Keep at least one element on the left so both recursive calls receive
    // a non-empty input.
    var done = false
    while(left.size > 1 && !done){
      val (bestMoveCandidate, bestMoveScore) =
        left.toSeq
          .map { i =>
            // Average distance to the *other* members of `left`
            // (hence size - 1; `i` itself contributes 0.0).
            val leftScore =
              (1.0 / (left.size - 1)) * distance(i, left)
            // Average distance to the members of `right`.
            val rightScore =
              (1.0 / right.size) * distance(i, right)
            i -> (leftScore - rightScore)
          }
          .maxBy { _._2 }

      if(bestMoveScore <= 0) { done = true }
      else {
        left -= bestMoveCandidate
        right += bestMoveCandidate
      }
    }

    Group(
      left = naive(left.toIndexedSeq.map { elements(_) }, measure),
      right = naive(right.toIndexedSeq.map { elements(_) }, measure),
      radius = furthestDistance,
      size = elements.size
    )
  }
}
|
|
|
|
// Bottom-up (agglomerative) clustering.
//
// Every element starts as its own Singleton.  A priority queue holds candidate
// merges (distance, clusterIdx, neighbourIdx), smallest distance first.  Each
// merge replaces two live clusters by a Group positioned at their centroid,
// and queues one new candidate from that Group to its nearest neighbour in
// the KD-tree.  The Group created when the tree becomes empty is the root.
//
// elements: the (position, value) pairs to cluster; must not be empty.
// measure:  supplies point-to-point distances and centroids.
// returns:  a Singleton for a one-element input, otherwise the root Group.
def bottomUp[I: Ordering, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I])(implicit tag: ClassTag[I]): Cluster[I, V] =
{
  assert(elements.nonEmpty)
  if(elements.size == 1){
    // Nothing to merge; mirror what `naive` returns for a single element.
    val (position, value) = elements.head
    Singleton(position, value)
  } else {
    val tree = new KDTree[I, Int](elements(0)._1.size)

    // PriorityQueue dequeues the greatest element first, so the comparison
    // is inverted to obtain the lowest distance first.
    val closestFirst = new Ordering[(Double, Int, Int)]{
      def compare(a: (Double, Int, Int), b: (Double, Int, Int)): Int =
        -Ordering[Double].compare(a._1, b._1)
    }
    val queue = mutable.PriorityQueue.empty[(Double, Int, Int)](closestFirst)

    // Slot i holds the live cluster with index i and its position, or None
    // once that cluster has been merged into another one.
    val clusters: mutable.ArrayBuffer[Option[(Array[I], Cluster[I, V])]] =
      mutable.ArrayBuffer(
        elements.map { e => Some(e._1 -> Singleton(e._1, e._2)) }:_*
      )

    val indexedPoints = elements.map { _._1 }.zipWithIndex
    tree.insertAll(indexedPoints)

    // Seed the queue with one candidate per element.  The `ignore` callback
    // rejects tree entries whose indices contain nothing other than `idx`
    // itself (presumably so an element is never paired with itself --
    // depends on KDTree.nearest semantics).
    for( (point, idx) <- indexedPoints )
    {
      tree.nearest(point, measure, ignore = { (_, others, _) => !others.exists { _ != idx } }) match {
        case Some( (_, nearestIdxs, distance) ) =>
          queue.enqueue( (distance, idx, nearestIdxs.filter { _ != idx }.head) )
        case None =>
          assert(false, s"no neighbour found for element $idx")
      }
    }

    // Each iteration removes one item from the queue and adds at most one.
    // Hard bound the number of iterations defensively.
    var result: Option[Cluster[I, V]] = None
    var remaining = elements.size * 4
    while(result.isEmpty && remaining > 0)
    {
      remaining -= 1
      val (_, aIdx, bIdx) = queue.dequeue()

      (clusters(aIdx), clusters(bIdx)) match {
        case (Some((aPos, aCluster)), Some((bPos, bCluster))) =>
          {
            // Collect the member positions once; they feed both the centroid
            // and the radius.
            val positions =
              (aCluster.elements ++ bCluster.elements).map { _.position }.toSeq
            val centroid = measure.centroid(positions)
            // Radius: largest distance from the centroid to any member.
            val radius = positions.map { p =>
              measure.pointToPoint(centroid, p)
            }.max

            tree.remove(aPos, aIdx)
            tree.remove(bPos, bIdx)
            clusters(aIdx) = None
            clusters(bIdx) = None

            val newCluster =
              Group(
                left = aCluster,
                right = bCluster,
                radius = radius,
                size = aCluster.size + bCluster.size
              )

            if(tree.isEmpty){
              // Nothing left to merge with: this Group is the root.
              result = Some(newCluster)
            } else {
              val newIdx = clusters.size
              clusters.append( Some((centroid, newCluster)) )

              // Look up the neighbour before inserting the centroid so the
              // new cluster cannot be reported as its own nearest entry.
              // The tree is non-empty on this branch.
              val (_, nearestIdxs, nearestDistance) =
                tree.nearest(centroid, measure).get

              tree.insert(centroid, newIdx)

              queue.enqueue(
                (nearestDistance, newIdx, nearestIdxs.head)
              )
            }
          }
        case _ =>
          // Stale candidate: at least one side was already merged away.
          ()
      }
    }

    result.getOrElse {
      throw new IllegalStateException(
        s"bottomUp did not converge within ${elements.size * 4} iterations"
      )
    }
  }
}
|
|
|
|
|
|
// Iterates over a cluster tree starting from `root`, always yielding the
// queued cluster with the largest radius next.  Yielding a cluster queues
// its children.
class ClusterIterator[I, V](root: Cluster[I, V])
  extends Iterator[Cluster[I, V]]
{
  // Compares clusters by radius; PriorityQueue dequeues the greatest first.
  private val byRadius: Ordering[Cluster[I, V]] =
    new Ordering[Cluster[I, V]]{
      def compare(a: Cluster[I, V], b: Cluster[I, V]) =
        Ordering[Double].compare(a.radius, b.radius)
    }

  // Clusters that have been reached but not yet yielded.
  val queue = PriorityQueue[Cluster[I, V]](root)(byRadius)

  def hasNext: Boolean = queue.nonEmpty

  // Removes and returns the queued cluster with the largest radius, queueing
  // each of its children one at a time.
  def next(): Cluster[I, V] =
  {
    val widest = queue.dequeue()
    for(child <- widest.children){ queue.enqueue(child) }
    widest
  }
}
|
|
|
|
// Iterates over the Singleton leaves below `root`, left to right.
//
// `stack` holds the Groups whose left subtree is being (or has been) visited
// and whose right subtree is still pending.  `nextSingleton` is the leaf the
// next call to `next()` returns, or null once the traversal is exhausted.
class ClusterElementIterator[I, V](root: Cluster[I, V])
  extends Iterator[Singleton[I, V]]
{
  val stack = Stack[Group[I, V]]()
  var nextSingleton: Singleton[I, V] = null
  pushLeft(root)

  // Walks down the left spine starting at `node`, pushing every Group passed
  // on the way, and records the Singleton at the bottom as `nextSingleton`.
  // A null `node` leaves the state untouched.
  def pushLeft(node: Cluster[I, V]): Unit =
  {
    @scala.annotation.tailrec
    def descend(current: Cluster[I, V]): Unit =
      current match {
        case null =>
          ()
        case g: Group[I, V] =>
          stack.push(g)
          descend(g.left)
        case s: Singleton[I, V] =>
          nextSingleton = s
      }
    descend(node)
  }

  def hasNext: Boolean = (nextSingleton != null)

  // Returns the current leaf and advances to the leftmost leaf of the most
  // recently pushed Group's right subtree.
  // Throws NoSuchElementException when no leaf is left, rather than handing
  // a null back to the caller.
  def next(): Singleton[I, V] =
  {
    if(!hasNext){
      throw new NoSuchElementException("next on exhausted ClusterElementIterator")
    }
    val current = nextSingleton
    if(stack.isEmpty){ nextSingleton = null }
    else { pushLeft(stack.pop().right) }
    current
  }
}
|
|
|
|
// A node of a cluster tree: either a leaf or an inner node with children.
trait Cluster[I, V]
{
  // Radius reported by this cluster.
  def radius: Double

  // Renders this cluster as text.  `firstPrefix` starts the first line;
  // `restPrefix` is handed on for the lines produced by nested clusters.
  def render(firstPrefix: String, restPrefix: String): String

  // Number of elements reported by this cluster.
  def size: Int

  // Direct sub-clusters; empty for a leaf.
  def children: Seq[Cluster[I, V]]

  // Iterator over this cluster and its descendants, largest radius first.
  def orderedIterator: ClusterIterator[I, V] =
    new ClusterIterator[I, V](this)

  // Iterator over the Singleton leaves below this cluster.
  def elements: ClusterElementIterator[I, V] =
    new ClusterElementIterator[I, V](this)

  // Cuts the tree at `cutoff`: a cluster whose radius does not exceed the
  // cutoff is returned as-is, otherwise the result is whatever its children
  // return.  A cluster without children whose radius exceeds the cutoff
  // therefore contributes nothing.
  def threshold(cutoff: Double): Iterator[Cluster[I, V]] =
    if(radius > cutoff){
      children.iterator.flatMap { child => child.threshold(cutoff) }
    } else {
      Iterator.single(this)
    }

  override def toString(): String = render("", "")
}
|
|
|
|
// Inner node of a cluster tree: two sub-clusters plus the radius and element
// count supplied by whoever built the node.
case class Group[I, V](left: Cluster[I,V], right: Cluster[I, V], radius: Double, size: Int) extends Cluster[I, V]
{
  def children: Seq[Cluster[I, V]] = Seq(left, right)

  // One header line showing the radius, followed by the rendering of the
  // left and then the right sub-cluster, each on its own line(s).
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val header = firstPrefix + s"- [$radius]\n"
    val leftText = left.render(restPrefix + " +-", restPrefix + " | ")
    val rightText = right.render(restPrefix + " +-", restPrefix + " ")
    header + leftText + "\n" + rightText
  }
}
|
|
// Leaf of a cluster tree: a single position and the value attached to it.
case class Singleton[I, V](position: Array[I], value: V) extends Cluster[I, V]
{
  // A leaf has no sub-clusters.
  def children = Seq.empty

  def radius: Double = 0.0

  def size: Int = 1

  // Single line: the prefix, the comma-separated coordinates in angle
  // brackets, then the value.
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val coordinates = position.mkString(", ")
    firstPrefix + " <" + coordinates + s"> -> $value"
  }
}
|
|
|
|
} |