129 lines
3.6 KiB
Scala
129 lines
3.6 KiB
Scala
package org.mimirdb.pip.lib
|
|
|
|
import scala.collection.mutable
|
|
import scala.collection.mutable.PriorityQueue
|
|
|
|
object HierarchicalClustering
|
|
{
|
|
/**
 * Build a cluster hierarchy by recursive top-down (divisive) splitting.
 *
 * At each level, the element with the largest summed distance to all other
 * elements seeds a new "right" partition; everything else starts in "left".
 * Elements are then moved one at a time from left to right for as long as
 * some left element is, on average, further from the other left elements
 * than it is from the right elements.  Both partitions are then clustered
 * recursively.
 *
 * @param elements  the (position, value) pairs to cluster; must be non-empty
 * @param measure   supplies the point-to-point distance between two positions
 * @return          a `Singleton` for a single element, otherwise a two-child
 *                  `Group` whose radius is the seed element's summed distance
 *                  to every element at that level
 */
def naive[I, V](elements: IndexedSeq[(Array[I], V)], measure: Distance[I]): Cluster[I, V] =
{
  // Sum of the distances from element `i` to every element indexed by `cmp`.
  //
  // The mapping is done on an iterator on purpose: `cmp` is frequently a
  // mutable.Set, and mapping a Set directly produces a Set[Double], which
  // would silently merge equal distance values before they are summed.
  def distance(i: Int, cmp: Iterable[Int]): Double =
    cmp.iterator.map { j =>
      if(i == j){ 0.0 }
      else { measure.pointToPoint(elements(i)._1, elements(j)._1) }
    }.sum

  assert(!elements.isEmpty)
  if(elements.size == 1){
    Singleton(elements.head._1, elements.head._2)
  } else {
    val all = elements.indices

    // Seed the split with the element whose total distance to everything
    // else is the largest.
    val (furthestIdx, furthestDistance) =
      all.map { i => i -> distance(i, all) }
         .maxBy { _._2 }

    val left =
      mutable.Set(all.filterNot { _ == furthestIdx }:_*)
    val right =
      mutable.Set[Int](furthestIdx)

    var done = false
    // `left.size > 1` guarantees that left never becomes empty and that the
    // `left.size - 1` divisor below is strictly positive.
    while(left.size > 1 && !done){
      val (bestMoveCandidate, bestMoveScore) =
        left.toSeq
            .map { i =>
              // Mean distance to the *other* members of left (the distance
              // of `i` to itself contributes 0.0 to the sum).
              val leftScore =
                (1.0 / (left.size - 1)) * distance(i, left)
              // Mean distance to the members of right.
              val rightScore =
                (1.0 / right.size) * distance(i, right)
              i -> (leftScore - rightScore)
            }
            .maxBy { _._2 }
      // Stop as soon as no element is closer (on average) to right than to left.
      if(bestMoveScore <= 0) { done = true }
      else {
        left -= bestMoveCandidate
        right += bestMoveCandidate
      }
    }

    Group(
      Array(
        naive(left.toIndexedSeq.map { elements(_) }, measure),
        naive(right.toIndexedSeq.map { elements(_) }, measure)
      ),
      radius = furthestDistance,
      size = elements.size
    )
  }
}
|
|
|
|
/**
 * Walks every cluster in the tree rooted at `root`.  Clusters waiting to be
 * returned are kept in a priority queue keyed on `radius`, so the pending
 * cluster with the largest radius is always produced next.
 */
class ClusterIterator[I, V](root: Cluster[I, V])
  extends Iterator[Cluster[I, V]]
{
  // Clusters that have been discovered but not yet handed out.
  val queue = PriorityQueue(root)(
    Ordering.by[Cluster[I, V], Double] { _.radius }
  )

  def hasNext: Boolean = queue.nonEmpty

  def next(): Cluster[I, V] =
  {
    val largest = queue.dequeue()
    // Once a cluster has been handed out, its children become candidates.
    for(child <- largest.children){ queue.enqueue(child) }
    largest
  }
}
|
|
|
|
/**
 * A node of a cluster hierarchy: either a leaf wrapping a single element or
 * an inner node that groups child clusters.
 */
trait Cluster[I, V]
{
  /** The radius reported for this cluster (0.0 for a single element). */
  def radius: Double

  /**
   * Render this subtree as text.
   *
   * @param firstPrefix  text placed in front of this node's own line
   * @param restPrefix   text from which the prefixes of nested nodes are built
   */
  def render(firstPrefix: String, restPrefix: String): String

  /** The element count reported for this cluster. */
  def size: Int

  /** Direct sub-clusters; empty for a leaf. */
  def children: Array[Cluster[I, V]]

  /** Iterate over this cluster and all of its descendants, largest radius first. */
  def orderedIterator: ClusterIterator[I, V] = new ClusterIterator(this)

  /** Iterate over the leaves below (or at) this cluster. */
  def elements: Iterator[Singleton[I, V]]

  /**
   * Cut the hierarchy at `cutoff`: descend through every cluster whose radius
   * exceeds the cutoff and return the first clusters on each path that do not.
   *
   * A cluster without children is always returned as-is, even if its radius
   * is above the cutoff (e.g. a negative cutoff); otherwise the elements
   * beneath it would silently vanish from the result.
   */
  def threshold(cutoff: Double): Iterator[Cluster[I, V]] =
  {
    val kids = children
    if(radius > cutoff && kids.nonEmpty){
      kids.iterator.flatMap { _.threshold(cutoff) }
    } else {
      Iterator.single(this)
    }
  }

  override def toString(): String = render("", "")
}
|
|
|
|
/**
 * An inner node of the hierarchy, grouping `children` under a common radius.
 *
 * Note: `children` is an Array, so the generated case-class `equals` and
 * `hashCode` treat it by reference rather than by content.
 *
 * @param children  the direct sub-clusters
 * @param radius    the radius reported for this group
 * @param size      the element count reported for this group
 */
case class Group[I, V](children: Array[Cluster[I,V]], radius: Double, size: Int) extends Cluster[I, V]
{
  /**
   * Render this group as one header line ("- [radius]") followed by one
   * rendered subtree per child.
   */
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val lastIdx = children.length - 1
    firstPrefix + s"- [$radius]\n" +
      children.zipWithIndex.map { case (child, idx) =>
        // Lines nested below the last child continue under blank space, lines
        // nested below any earlier child continue under a vertical bar.  Both
        // continuations are as wide as the " +-" connector so that deeper
        // levels stay aligned.
        val continuation =
          if(idx == lastIdx){ "   " } else { " | " }
        child.render(restPrefix + " +-", restPrefix + continuation)
      }.mkString("\n")
  }

  /** All leaves below this group, child by child. */
  def elements: Iterator[Singleton[I, V]] =
    children.iterator.flatMap { _.elements }
}
|
|
/**
 * A leaf of the hierarchy: a single element made of a position and the value
 * attached to it.  It has radius 0.0, size 1 and no children.
 */
case class Singleton[I, V](position: Array[I], value: V) extends Cluster[I, V]
{
  def radius: Double = 0.0

  def size: Int = 1

  def children: Array[Cluster[I, V]] = Array()

  /** A leaf's only element is itself. */
  def elements: Iterator[Singleton[I, V]] =
    Iterator.single(this)

  /** One line of the form "<p1, p2, ...> -> value"; `restPrefix` is not used. */
  def render(firstPrefix: String, restPrefix: String): String =
  {
    val coordinates = position.mkString(", ")
    s"$firstPrefix <$coordinates> -> $value"
  }
}
|
|
|
|
} |