from typing import List, Union, Callable, Optional
from nlpatl.models.clustering import Clustering
from nlpatl.models.embeddings import Embeddings
from nlpatl.learning import Learning
from nlpatl.dataset import Dataset
class UnsupervisedLearning(Learning):
    """
    | Applying unsupervised learning approach to annotate the most valuable data points.
        You may refer to https://homepages.tuni.fi/tuomas.virtanen/papers/active-learning-sound.pdf.
        Here is the pseudo:
    | 1. [NLPatl] Convert raw data to features (Embeddings model)
    | 2. [NLPatl] Train model and clustering data points (Clustering model)
    | 3. [NLPatl] Estimate the most valuable data points (Sampling)
    | 4. [Human] Subject matter experts annotates the most valuable data points
    | 5. Repeat Step 2 to 4 until acquire enough data points.

    :param sampling: Sampling method for getting the most valuable data points.
        Providing certified methods name (`most_confidence`, `entropy`,
        `least_confidence`, `margin`, `nearest_mean`, `fathest`)
        or custom function.
    :type sampling: str or function
    :param embeddings: Function for converting raw data to embeddings. Providing
        model name according to embeddings type. For example, `multi-qa-MiniLM-L6-cos-v1`
        for `sentence_transformers`. `bert-base-uncased` for
        `transformers`. `vgg16` for `torch_vision`.
    :type embeddings: str or :class:`nlpatl.models.embeddings.Embeddings`
    :param embeddings_model_config: Configuration for embeddings models. Optional. Ignored
        if using custom embeddings class
    :type embeddings_model_config: dict
    :param embeddings_type: Type of embeddings. `sentence_transformers` for text,
        `transformers` for text or `torch_vision` for image
    :type embeddings_type: str
    :param clustering: Function for clustering inputs. Either providing
        certified methods (`kmeans`) or custom function.
    :type clustering: str or :class:`nlpatl.models.clustering.Clustering`
    :param clustering_model_config: Configuration for clustering models. Optional. Ignored
        if using custom clustering class
    :type clustering_model_config: dict
    :param multi_label: Indicate the classification model is multi-label or
        multi-class (or binary). Default is False.
    :type multi_label: bool
    :param name: Name of this learning.
    :type name: str
    """

    def __init__(self,
                 sampling: Union[str, Callable],
                 embeddings: Union[str, Embeddings],
                 clustering: Union[str, Clustering],
                 embeddings_type: Optional[str] = None,
                 embeddings_model_config: Optional[dict] = None,
                 clustering_model_config: Optional[dict] = None,
                 multi_label: bool = False,
                 name: str = 'unsupervised_learning'):
        """
        Forward every argument unchanged to the :class:`Learning` initializer.

        See the class docstring for the meaning of each parameter.
        """
        super().__init__(sampling=sampling,
                         embeddings=embeddings,
                         embeddings_type=embeddings_type,
                         embeddings_model_config=embeddings_model_config,
                         clustering=clustering,
                         clustering_model_config=clustering_model_config,
                         multi_label=multi_label,
                         name=name)

    def validate(self):
        """
        Run the base-class validation for the `embeddings` and `clustering`
        components, which are the two this workflow uses in :meth:`explore`.
        """
        # NOTE(review): presumably the base class raises when a listed
        # component is missing -- confirm against Learning.validate.
        super().validate(['embeddings', 'clustering'])

    def explore(self, inputs: List[str], return_type: str = 'dict',
                num_sample: int = 2) -> Union[Dataset, dict]:
        """
        Embed the inputs, cluster them and return the sampled data points.

        :param inputs: Raw inputs. They are converted by the embeddings
            model and indexed positionally to build the returned inputs.
        :type inputs: list of str
        :param return_type: Passed through to `get_return_object` to choose
            the type of the returned object. Default is `dict`.
        :type return_type: str
        :param num_sample: Passed to the sampling function as `num_sample`.
            Default is 2.
        :type num_sample: int

        :return: Object built by `get_return_object` from the sampled
            clustering result.
        :rtype: :class:`nlpatl.dataset.Dataset` or dict
        """
        self.validate()

        # The same features are used both to fit the clustering model and
        # to score every data point against the fitted clusters.
        features = self.embeddings_model.convert(inputs)
        self.clustering_model.train(features)
        preds = self.clustering_model.predict_proba(features)

        indices, values = self.sampling(
            preds.values, preds.groups, num_sample=num_sample)
        preds.keep(indices)

        # Replace original probabilities by sampling values
        preds.values = values

        # Attach the raw inputs of the kept data points, looked up through
        # the indices the result carries after keep().
        preds.inputs = [inputs[i] for i in preds.indices.tolist()]

        return self.get_return_object(preds, return_type)