Source code for nlpatl.models.clustering.sklearn_clustering

from typing import List, Union
from collections import defaultdict
import numpy as np
from sklearn.cluster import KMeans

from nlpatl.models.clustering import Clustering
from nlpatl.dataset import Dataset

MODEL_FOR_SKLEARN_CLUSTERING_MAPPING_NAMES = {
    "kmeans": KMeans,
}


[docs]class SkLearnClustering(Clustering): """ A wrapper of sci-kit learn clustering class. :param model_name: sci-kit learn clustering model name. Possible values are `kmeans`. :type model_name: str :param model_config: Model paramateters. Refer to https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster :type model_config: dict :param name: Name of this clustering :type name: str >>> import nlpatl.models.clustering as nmclu >>> model = nmclu.SkLearnClustering() """ def __init__( self, model_name: str = "kmeans", model_config: dict = {}, name: str = "sklearn_clustering", ): super().__init__(name) self.model_name = model_name self.model_config = model_config if model_name in self.get_mapping(): self.model = self.get_mapping()[model_name](**model_config) else: raise ValueError( "`{}` does not support. Supporting {} only".format( model_name, "`" + "`,`".join(self.get_mapping().keys()) + "`" ) ) @staticmethod def get_mapping() -> dict: return MODEL_FOR_SKLEARN_CLUSTERING_MAPPING_NAMES
[docs] def train(self, x: Union[List[float], np.ndarray]): self.model.fit(x)
[docs] def predict_proba(self, x: List[float], predict_config: dict = {}) -> Dataset: """ :param x: Raw features :type x: np.ndarray :param predict_config: Model prediction paramateters. Refer to https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster :type predict_config: dict :return: Feature and probabilities :rtype: :class:`nlptatl.dataset.Dataset` """ clust_dists = self.model.transform(x) preds = self.model.predict(x, **predict_config) total_record = len(preds) indices = np.zeros(total_record, dtype=int) values = np.zeros(total_record, dtype=np.float) groups = [-1] * total_record start_pos = 0 for label in range(self.model.n_clusters): label_indices = np.where(preds == label)[0] end_pos = start_pos + len(label_indices) indices[start_pos:end_pos] = label_indices values[start_pos:end_pos] = clust_dists[label_indices][:, label] groups[start_pos:end_pos] = [label] * len(label_indices) start_pos = end_pos return Dataset(features=x, indices=indices, values=values, groups=groups)