Source code for classix.aggregation

# -*- coding: utf-8 -*-
#
# CLASSIX: Fast and explainable clustering based on sorting
#
# MIT License
#
# Copyright (c) 2023 Stefan Güttel, Xinye Chen


# Python implementation for aggregation


import numpy as np
from scipy.sparse.linalg import svds
from scipy.linalg import get_blas_funcs, eigh


# stefan added 23/10/2023
def precompute_aggregate_pca(data, sorting='pca', tol=0.5):
    """Aggregate the data with PCA sorting using precomputation.

    The points are sorted by their score along the first principal
    direction, then greedily grouped: the first unassigned point (in sorted
    order) becomes a starting point and absorbs every later unassigned point
    whose Euclidean distance to it is at most ``tol``.

    Parameters
    ----------
    data : numpy.ndarray
        The input array of shape (n_samples, n_features).

    sorting : str
        Not used by this function (PCA sorting is always applied); kept so
        the signature matches the other aggregation routines.

    tol : float
        The tolerance to control the aggregation. If the distance between a
        starting point and an object is less than or equal to the tolerance,
        the object is allocated to the group the starting point belongs to.


    Returns
    -------
    labels (list) :
        The group categories of the (sorted) data after aggregation.

    splist (list) :
        The list of the starting points, stored as tuples
        ``(index in sorted data, group size)``.

    nr_dist (int) :
        The number of pairwise distance calculations.

    ind (numpy.ndarray):
        Array storing sorting indices.

    sort_vals (numpy.ndarray):
        Sorting values (in sorted order).

    data (numpy.ndarray):
        Sorted data.

    half_nrm2 (numpy.ndarray):
        Precomputed values for distance computation, i.e. half of the
        squared norm of every row of the sorted data.

    """

    len_ind, fdim = data.shape

    # get sorting values
    if fdim > 1:
        if fdim <= 3: # memory inefficient
            # leading eigenvector of data.T @ data (eigh sorts ascending,
            # so index fdim-1 is the largest eigenvalue)
            gemm = get_blas_funcs("gemm", [data.T, data])
            _, U1 = eigh(gemm(1, data.T, data), subset_by_index=[fdim-1, fdim-1])
            sort_vals = data@U1.reshape(-1)
        else:
            U1, s1, _ = svds(data, k=1, return_singular_vectors=True)
            sort_vals = U1[:,0]*s1[0]
    else:
        sort_vals = data[:,0]

    # Flip to enforce deterministic output: orient the scores so that the
    # first one is not positive. Multiplying by np.sign(-sort_vals[0]) would
    # zero out *all* sorting values when the first one is exactly 0, so that
    # case is simply left unflipped.
    if sort_vals[0] > 0:
        sort_vals = -sort_vals

    ind = np.argsort(sort_vals)
    data = data[ind,:] # sort data
    sort_vals = sort_vals[ind]

    # ||x_j - x_i||^2 <= tol^2  is equivalent to
    # 0.5*||x_j||^2 - <x_i, x_j> <= 0.5*tol^2 - 0.5*||x_i||^2
    half_r2 = 0.5*tol**2
    half_nrm2 = np.einsum('ij,ij->i', data, data) * 0.5 # precomputation

    lab = 0
    labels = [-1] * len_ind
    nr_dist = 0
    splist = list()

    for i in range(len_ind):
        if labels[i] >= 0:
            continue

        clustc = data[i,:]
        labels[i] = lab
        num_group = 1

        rhs = half_r2 - half_nrm2[i] # right-hand side of norm ineq.
        # only points whose sorting value is within tol of the starting
        # point's value can be within distance tol of it
        last_j = np.searchsorted(sort_vals, tol + sort_vals[i], side='right')
        # inner products of all candidates with the starting point at once
        ips = np.matmul(data[i+1:last_j,:], clustc.T)

        for j in range(i+1, last_j):
            if labels[j] >= 0:
                continue

            nr_dist += 1
            if half_nrm2[j] - ips[j-i-1] <= rhs:
                num_group += 1
                labels[j] = lab

        splist.append((i, num_group))
        lab += 1

    return labels, splist, nr_dist, ind, sort_vals, data, half_nrm2



def precompute_aggregate(data, sorting="pca", tol=0.5):
    """Aggregate the data using precomputation.

    The points are sorted by the chosen sorting values and then greedily
    grouped: the first unassigned point (in sorted order) becomes a starting
    point and absorbs every later unassigned point whose Euclidean distance
    to it is at most ``tol``. Half squared norms are precomputed so that each
    distance test only needs one inner product.

    Parameters
    ----------
    data : numpy.ndarray
        The input array of shape (n_samples, n_features).

    sorting : str
        The sorting way referred for aggregation, default='pca', other
        options: 'norm-mean', 'norm-orthant'. Any other value disables
        sorting (all sorting values are zero).

    tol : float
        The tolerance to control the aggregation. If the distance between a
        starting point and an object is less than or equal to the tolerance,
        the object is allocated to the group the starting point belongs to.


    Returns
    -------
    labels (list) :
        The group categories of the (sorted) data after aggregation.

    splist (list) :
        The list of the starting points, stored as tuples
        ``(index in sorted data, group size)``.

    nr_dist (int) :
        The number of pairwise distance calculations.

    ind (numpy.ndarray):
        Array storing sorting indices.

    sort_vals (numpy.ndarray):
        Sorting values (in sorted order).

    data (numpy.ndarray):
        Sorted data.

    half_nrm2 (numpy.ndarray):
        Precomputed values for distance computation, i.e. half of the
        squared norm of every row of the sorted data.

    """
    splist = list() # store the starting points
    len_ind = data.shape[0]
    fdim = data.shape[1]

    if sorting == "norm-mean" or sorting == "norm-orthant":
        sort_vals = np.linalg.norm(data, ord=2, axis=1)

    elif sorting == "pca":
        # change to svd
        if fdim > 1:
            if fdim <= 3: # memory inefficient
                # leading eigenvector of data.T @ data (eigh sorts ascending,
                # so index fdim-1 is the largest eigenvalue)
                gemm = get_blas_funcs("gemm", [data.T, data])
                _, U1 = eigh(gemm(1, data.T, data), subset_by_index=[fdim-1, fdim-1])
                sort_vals = data@U1.reshape(-1)
            else:
                U1, s1, _ = svds(data, k=1, return_singular_vectors=True)
                sort_vals = U1[:,0]*s1[0]
        else:
            sort_vals = data[:,0]

        # Flip to enforce deterministic output: orient the scores so that
        # the first one is not positive. Multiplying by
        # np.sign(-sort_vals[0]) would zero out *all* sorting values when
        # the first one is exactly 0, so that case is left unflipped.
        if sort_vals[0] > 0:
            sort_vals = -sort_vals

    else: # no sorting
        sort_vals = np.zeros(len_ind)

    ind = np.argsort(sort_vals)
    data = data[ind]
    sort_vals = sort_vals[ind]

    lab = 0
    labels = [-1]*len_ind
    nr_dist = 0

    # ||x_j - x_i||^2 <= tol^2  is equivalent to
    # 0.5*||x_j||^2 - <x_i, x_j> <= 0.5*tol^2 - 0.5*||x_i||^2
    half_r2 = tol**2 * 0.5
    half_nrm2 = np.einsum('ij,ij->i', data, data) * 0.5 # precomputation

    for i in range(len_ind):
        if labels[i] >= 0:
            continue

        clustc = data[i,:]
        labels[i] = lab
        num_group = 1
        rhs = half_r2 - half_nrm2[i] # right-hand side of norm ineq.

        for j in range(i+1, len_ind):
            if labels[j] >= 0:
                continue

            # sorting values are ascending, so no later point can be
            # within tol of the starting point either
            if sort_vals[j] - sort_vals[i] > tol:
                break

            nr_dist += 1
            if half_nrm2[j] - np.inner(clustc, data[j]) <= rhs:
                num_group += 1
                labels[j] = lab

        splist.append((i, num_group))
        lab += 1

    return labels, splist, nr_dist, ind, sort_vals, data, half_nrm2
def aggregate(data, sorting="pca", tol=0.5):
    """Aggregate the data.

    The points are sorted by the chosen sorting values and then greedily
    grouped: the first unassigned point (in sorted order) becomes a starting
    point and absorbs every later unassigned point whose Euclidean distance
    to it is at most ``tol``.

    Parameters
    ----------
    data : numpy.ndarray
        The input array of shape (n_samples, n_features).

    sorting : str
        The sorting method for aggregation, default='pca', other options:
        'norm-mean', 'norm-orthant'. Any other value disables sorting (all
        sorting values are zero).

    tol : float
        The tolerance to control the aggregation. If the distance between
        the starting point of a group and another data point is less than or
        equal to the tolerance, the point is allocated to that group.


    Returns
    -------
    labels (list) :
        The group categories of the (sorted) data after aggregation.

    splist (list) :
        The list of the starting points, stored as tuples
        ``(index in sorted data, group size)``.

    nr_dist (int) :
        The number of pairwise distance calculations.

    ind (numpy.ndarray):
        Array storing sorting indices.

    sort_vals (numpy.ndarray):
        Sorting values (in sorted order).

    data (numpy.ndarray):
        Sorted data.

    """
    splist = list() # store the starting points
    len_ind = data.shape[0]
    fdim = data.shape[1]

    if sorting == "norm-mean" or sorting == "norm-orthant":
        sort_vals = np.linalg.norm(data, ord=2, axis=1)

    elif sorting == "pca":
        # change to svd
        if fdim > 1:
            if fdim <= 3: # memory inefficient
                # leading eigenvector of data.T @ data (eigh sorts ascending,
                # so index fdim-1 is the largest eigenvalue)
                gemm = get_blas_funcs("gemm", [data.T, data])
                _, U1 = eigh(gemm(1, data.T, data), subset_by_index=[fdim-1, fdim-1])
                sort_vals = data@U1.reshape(-1)
            else:
                U1, s1, _ = svds(data, k=1, return_singular_vectors=True)
                sort_vals = U1[:,0]*s1[0]
        else:
            sort_vals = data[:,0]

        # Flip to enforce deterministic output: orient the scores so that
        # the first one is not positive. Multiplying by
        # np.sign(-sort_vals[0]) would zero out *all* sorting values when
        # the first one is exactly 0, so that case is left unflipped.
        if sort_vals[0] > 0:
            sort_vals = -sort_vals

    else: # no sorting
        sort_vals = np.zeros(len_ind)

    ind = np.argsort(sort_vals)
    data = data[ind]
    sort_vals = sort_vals[ind]

    lab = 0
    labels = [-1]*len_ind
    nr_dist = 0
    tol2 = tol**2 # squared radius, loop invariant

    for i in range(len_ind):
        if labels[i] >= 0:
            continue

        clustc = data[i,:]
        labels[i] = lab
        num_group = 1

        for j in range(i+1, len_ind):
            if labels[j] >= 0:
                continue

            # sorting values are ascending, so no later point can be
            # within tol of the starting point either
            if sort_vals[j] - sort_vals[i] > tol:
                break

            dat = clustc - data[j,:]
            dist = np.inner(dat, dat) # squared Euclidean distance
            nr_dist += 1

            if dist <= tol2:
                num_group += 1
                labels[j] = lab

        splist.append((i, num_group))
        lab += 1

    return labels, splist, nr_dist, ind, sort_vals, data