Source code for classix.merging

# -*- coding: utf-8 -*-
#
# CLASSIX: Fast and explainable clustering based on sorting
#
# MIT License
#
# Copyright (c) 2023 Stefan Güttel, Xinye Chen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Python implementation for group merging

import collections
import numpy as np
from scipy.special import betainc, gamma


def distance_merging_mtg(data, labels, splist, radius, minPts, scale, sort_vals, half_nrm2):
    """
    Implement CLASSIX's merging without merging tiny groups
    
    Parameters
    ----------
    data : numpy.ndarray
        The input that is array-like of shape (n_samples,).
    
    labels : list
        aggregation labels

    splist : numpy.ndarray
        Represent the list of starting points information formed in the aggregation. 
        list of [ starting point index of current group, sorting values, and number of group elements ].

    radius : float
        The tolerance to control the aggregation. If the distance between the starting point 
        of a group and another data point is less than or equal to the tolerance,
        the point is allocated to that group. For details, we refer users to [1].

    minPts : int, default=0
        The threshold, in the range of [0, infity] to determine the noise degree.
        When assign it 0, algorithm won't check noises.

    scale : float, default 1.5
        Design for distance-clustering, when distance between the two starting points 
        associated with two distinct groups smaller than scale*radius, then the two groups merge.

    sort_vals : numpy.ndarray
        Sorting values.
        
    half_nrm2 : numpy.ndarray
        Precomputed values for distance computation.

    Returns
    -------
    labels : numpy.ndarray
        The merging labels.
    
    old_cluster_count : int
        The number of clusters without outliers elimination.
    
    SIZE_NOISE_LABELS : int
        The number of clusters marked as outliers.

        
    References
    ----------
    [1] X. Chen and S. Güttel. Fast and explainable sorted based clustering, 2022

    """
    
    splist_indices = splist[:, 0]
    gp_nr = splist[:, 1] >= minPts
    spdata = data[splist_indices]
    sort_vals_sp = sort_vals[splist_indices]

    labels = np.asarray(labels)
    sp_cluster_labels = labels[splist_indices]

    radius = scale*radius
    scaled_radius = 0.5*(radius)**2

    for i in range(splist.shape[0]):
        if not gp_nr[i]:    # tiny groups can not take over large ones
            continue

        xi = spdata[i, :]
        rhs = scaled_radius - half_nrm2[i]
        last_j = np.searchsorted(sort_vals_sp, radius + sort_vals_sp[i], side='right')

        inds = (half_nrm2[i:last_j] - np.matmul(spdata[i:last_j], xi) <= rhs)
        inds = i + np.where(inds&gp_nr[i:last_j])[0]

        spl = np.unique(sp_cluster_labels[inds])
        minlab = np.min(spl)
        
        for label in spl:
            sp_cluster_labels[sp_cluster_labels==label] = minlab

    # rename labels to be 1,2,3,... and determine cluster sizes
    ul = np.unique(sp_cluster_labels)
    nr_u = len(ul)
    
    cs = np.zeros(nr_u, dtype=int)
    grp_sizes = splist[:, 1].astype(int)

    for i in range(nr_u):
        cid = sp_cluster_labels==ul[i]
        sp_cluster_labels[cid] = i
        cs[i] = np.sum(grp_sizes[cid])

    old_cluster_count = collections.Counter(sp_cluster_labels[labels])
    cid = np.nonzero(cs < minPts)[0]
    SIZE_NOISE_LABELS = cid.size

    if SIZE_NOISE_LABELS > 0:
        copy_sp_cluster_label = sp_cluster_labels.copy()
        
        for i in cid:
            ii = np.nonzero(copy_sp_cluster_label==i)[0] # find all tiny groups with that label
            for iii in ii:
                xi = spdata[iii, :]    # starting point of one tiny group

                dist = half_nrm2 - np.matmul(spdata, xi) + half_nrm2[iii]
                merge_ind = np.argsort(dist)
                for j in merge_ind:
                    if cs[copy_sp_cluster_label[j]] >= minPts:
                        sp_cluster_labels[iii] = copy_sp_cluster_label[j]
                        break

        ul = np.unique(sp_cluster_labels)
        nr_u = len(ul)
        cs = np.zeros(nr_u, dtype=int)

        for i in range(nr_u):
            cid = sp_cluster_labels==ul[i]
            sp_cluster_labels[cid] = i
            cs[i] = np.sum(grp_sizes[cid])

    labels = sp_cluster_labels[labels]

    return labels, old_cluster_count, SIZE_NOISE_LABELS




[docs]
def distance_merging(data, labels, splist, radius, minPts, scale, sort_vals, half_nrm2):
    """
    Implement CLASSIX's merging with early stopping and BLAS routines
    
    Parameters
    ----------
    data : numpy.ndarray
        The input that is array-like of shape (n_samples,).
    
    labels : list
        aggregation labels

    splist : numpy.ndarray
        Represent the list of starting points information formed in the aggregation. 
        list of [ starting point index of current group, sorting values, and number of group elements ].

    radius : float
        The tolerance to control the aggregation. If the distance between the starting point 
        of a group and another data point is less than or equal to the tolerance,
        the point is allocated to that group. For details, we refer users to [1].

    minPts : int
        The threshold, in the range of [0, infity] to determine the noise degree.
        When assign it 0, algorithm won't check noises.

    scale : float
        Design for distance-clustering, when distance between the two starting points 
        associated with two distinct groups smaller than scale*radius, then the two groups merge.

    sort_vals : numpy.ndarray
        Sorting values.
        
    half_nrm2 : numpy.ndarray
        Precomputed values for distance computation.

    Returns
    -------
    labels : numpy.ndarray
        The merging labels.
    
    old_cluster_count : int
        The number of clusters without outliers elimination.
    
    SIZE_NOISE_LABELS : int
        The number of clusters marked as outliers.

        
    References
    ----------
    [1] X. Chen and S. Güttel. Fast and explainable sorted based clustering, 2022

    """
    
    splist_indices = splist[:, 0]
    
    sort_vals_sp = sort_vals[splist_indices]
    spdata = data[splist_indices]
    
    labels = np.asarray(labels)
    sp_cluster_labels = labels[splist_indices]
    radius = scale*radius
    radius_2 = (radius)**2/2

    for i in range(splist.shape[0]):
        xi = spdata[i, :]

        last_j = np.searchsorted(sort_vals_sp, radius + sort_vals_sp[i], side='right')
        eucl = half_nrm2[i:last_j] - np.matmul(spdata[i:last_j,:], xi)
        inds = np.where(eucl <= radius_2 - half_nrm2[i]) 

        inds = i + inds[0]
        spl = np.unique(sp_cluster_labels[inds])
        minlab = np.min(spl)
        
        for label in spl:
            sp_cluster_labels[sp_cluster_labels==label] = minlab

    # rename labels to be 1,2,3,... and determine cluster sizes
    ul = np.unique(sp_cluster_labels)
    nr_u = len(ul)
    
    cs = np.zeros(nr_u, dtype=int)
    grp_sizes = splist[:, 1].astype(int)

    for i in range(nr_u):
        cid = sp_cluster_labels==ul[i]
        sp_cluster_labels[cid] = i
        cs[i] = np.sum(grp_sizes[cid])

    old_cluster_count = collections.Counter(sp_cluster_labels[labels])
    cid = np.nonzero(cs < minPts)[0]
    SIZE_NOISE_LABELS = cid.size

    if SIZE_NOISE_LABELS > 0:
        copy_sp_cluster_label = sp_cluster_labels.copy()
        
        for i in cid:
            ii = np.nonzero(copy_sp_cluster_label==i)[0] # find all tiny groups with that label
            for iii in ii:
                xi = spdata[iii, :]    # starting point of one tiny group

                dist = half_nrm2 - np.matmul(spdata, xi) + half_nrm2[iii]
                merge_ind = np.argsort(dist)
                for j in merge_ind:
                    if cs[copy_sp_cluster_label[j]] >= minPts:
                        sp_cluster_labels[iii] = copy_sp_cluster_label[j]
                        break

        ul = np.unique(sp_cluster_labels)
        nr_u = len(ul)
        cs = np.zeros(nr_u, dtype=int)

        for i in range(nr_u):
            cid = sp_cluster_labels==ul[i]
            sp_cluster_labels[cid] = i
            cs[i] = np.sum(grp_sizes[cid])

    labels = sp_cluster_labels[labels]

    return labels, old_cluster_count, SIZE_NOISE_LABELS






[docs]
def density_merging(data, splist, radius, sort_vals, half_nrm2):
    """
    Implement CLASSIX's merging with disjoint-set data structure, default choice for the merging.
    
    Parameters
    ----------
    data : numpy.ndarray
        The input that is array-like of shape (n_samples,).
    
    splist : numpy.ndarray
        Represent the list of starting points information formed in the aggregation. 
        list of [ starting point index of current group, sorting values, and number of group elements ]

    radius : float
        The tolerance to control the aggregation. If the distance between the starting point 
        of a group and another data point is less than or equal to the tolerance,
        the point is allocated to that group. For details, we refer users to [1].

    sort_vals : numpy.ndarray
        Sorting values.

    half_nrm2 : numpy.ndarray
        Precomputed values for distance computation.

        
    Returns
    -------
    labels_set : list
        Connected components of graph with groups as vertices.
    
    connected_pairs_store : list
        List for connected group labels.

    References
    ----------
    [1] X. Chen and S. Güttel. Fast and explainable sorted based clustering, 2022

    """
    
    connected_pairs = [SET(i) for i in range(splist.shape[0])]
    connected_pairs_store = []
    splist_indices = splist[:, 0]
    sort_vals_sp = sort_vals[splist_indices]
    spdata = data[splist_indices]
    half_nrm2_sp = half_nrm2[splist_indices]

    volume = np.pi**(data.shape[1]/2) * radius**data.shape[1] / gamma(data.shape[1]/2+1)

    radius_2 = (2*radius)**2
    radius_22 = radius**2

    for i in range(splist.shape[0]):
        sp1 = spdata[i]
        last_j = np.searchsorted(sort_vals_sp, 2*radius + sort_vals_sp[i], side='right')
        
        neigbor_sp = spdata[i+1:last_j]
        
        # THIS PART WE OMIT THE FAST QUERY WITH EARLY STOPPING CAUSE WE DON'T THINK EARLY STOPPING IN PYTHON CODE CAN BE FASTER THAN 
        # PYTHON BROADCAST, BUT IN THE CYTHON CODE, WE IMPLEMENT FAST QUERY WITH EARLY STOPPING BY LEVERAGING THE ADVANTAGE OF SORTED 
        # AGGREGATION.
        # index_overlap = euclid(2*half_nrm2_sp[i+1:last_j], neigbor_sp, sp1) <= radius_2 # 2*distance_scale*radius

        index_overlap = half_nrm2_sp[i+1:last_j] - np.matmul(neigbor_sp, sp1) <= radius_2 - half_nrm2_sp[i]
        
        select_stps = i+1 + np.where(index_overlap)[0]
        # calculate their neighbors ...1)
        # later decide whether sps should merge with neighbors
        if not np.any(index_overlap):
            continue

        c1 = euclid(2*half_nrm2, data, sp1) <= radius_22
        den1 = np.count_nonzero(c1) / volume
        
        for j in select_stps:
            sp2 = spdata[j]

            c2 = euclid(2*half_nrm2, data, sp2) <= radius_22
            den2 = np.count_nonzero(c2) / volume

            if check_if_overlap(sp1, sp2, radius=radius): 
                internum = np.count_nonzero(c1 & c2)
                cid = cal_inter_density(sp1, sp2, radius=radius, num=internum)
                if cid >= den1 or cid >= den2: 
                    merge(connected_pairs[i], connected_pairs[j])
                    connected_pairs_store.append([i, j])

        
    labels = [findParent(i).data for i in connected_pairs]
    labels_set = list()
    for i in np.unique(labels):
        labels_set.append(np.where(labels == i)[0].tolist())

    return labels_set, connected_pairs_store 





def euclid(xxt, X, v):
    return (xxt + np.inner(v,v).ravel() -2*X.dot(v)).astype(float)
    


class SET:
    """Disjoint-set data structure."""
    def __init__( self, data):
        self.data = data
        self.parent = self

        
        
def findParent(s):
    """Find parent of node."""
    if (s.data != s.parent.data) :
        s.parent = findParent((s.parent))
    return s.parent



def merge(s1, s2):
    """Merge the roots of two node."""
    parent_of_s1 = findParent(s1)
    parent_of_s2 = findParent(s2)

    if (parent_of_s1.data != parent_of_s2.data) :
        findParent(s1).parent = parent_of_s2
        
        

def check_if_overlap(starting_point, spo, radius, scale = 1):
    """Check if two groups formed by aggregation overlap
    """
    return np.linalg.norm(starting_point - spo, ord=2, axis=-1) <= 2 * scale * radius

    

def cal_inter_density(starting_point, spo, radius, num):
    """Calculate the density of intersection (lens)
    """
    in_volume = cal_inter_volume(starting_point, spo, radius)
    return num / in_volume



def cal_inter_volume(starting_point, spo, radius):
    """
    Returns the volume of the intersection of two spheres in n-dimensional space.
    The radius of the two spheres is r and the distance of their centers is d.
    For d=0 the function returns the volume of full sphere.
    Reference: https://math.stackexchange.com/questions/162250/how-to-compute-the-volume-of-intersection-between-two-hyperspheres

    """
    
    dim =starting_point.shape[0]
    dist = np.linalg.norm(starting_point - spo, ord=2, axis=-1) # the distance between the two groups
    
    if dist > 2*radius:
        return 0
    c = dist / 2
    return np.pi**(dim/2)/gamma(dim/2 + 1)*(radius**dim)*betainc((dim + 1)/2, 1/2, 1 - c**2/radius**2)