# Source code for nlpatl.learning.unsupervised_learning

from typing import List, Union, Callable, Optional

from nlpatl.models.clustering import Clustering
from nlpatl.models.embeddings import Embeddings
from nlpatl.learning import Learning
from nlpatl.dataset import Dataset


class UnsupervisedLearning(Learning):
	"""
		| Applying an unsupervised learning approach to annotate the most valuable data points.
			You may refer to https://homepages.tuni.fi/tuomas.virtanen/papers/active-learning-sound.pdf.
			Here is the pseudocode:
		|	1. [NLPatl] Convert raw data to features (Embeddings model)
		|	2. [NLPatl] Train model and cluster data points (Clustering model)
		|	3. [NLPatl] Estimate the most valuable data points (Sampling)
		|	4. [Human] Subject matter experts annotate the most valuable data points
		|	5. Repeat Step 2 to 4 until enough data points are acquired.
		
		:param sampling: Sampling method for getting the most valuable data points. 
			Provide a certified method name (`most_confidence`, `entropy`, 
			`least_confidence`, `margin`, `nearest_mean`, `fathest`)
			or a custom function.
		:type sampling: str or function
		:param embeddings: Function for converting raw data to embeddings. Provide the 
			model name according to the embeddings type. For example, 
			`multi-qa-MiniLM-L6-cos-v1` for `sentence_transformers`, 
			`bert-base-uncased` for `transformers`, `vgg16` for `torch_vision`.
		:type embeddings: str or :class:`nlpatl.models.embeddings.Embeddings`
		:param embeddings_model_config: Configuration for embeddings models. Optional. Ignored
			if using a custom embeddings class
		:type embeddings_model_config: dict
		:param embeddings_type: Type of embeddings. `sentence_transformers` for text, 
			`transformers` for text or `torch_vision` for image
		:type embeddings_type: str
		:param clustering: Function for clustering inputs. Either provide a
			certified method (`kmeans`) or a custom function.
		:type clustering: str or :class:`nlpatl.models.clustering.Clustering`
		:param clustering_model_config: Configuration for clustering models. Optional. Ignored
			if using a custom clustering class
		:type clustering_model_config: dict
		:param multi_label: Indicate whether the classification model is multi-label or 
			multi-class (or binary). Default is False.
		:type multi_label: bool
		:param name: Name of this learning.
		:type name: str
	"""

	def __init__(self, 
		sampling: Union[str, Callable],
		embeddings: Union[str, Embeddings], 
		clustering: Union[str, Clustering],
		embeddings_type: Optional[str] = None,
		embeddings_model_config: Optional[dict] = None,
		clustering_model_config: Optional[dict] = None,
		multi_label: bool = False, 
		name: str = 'unsupervised_learning'):

		# All arguments are forwarded unchanged; the base class owns the setup.
		super().__init__(sampling=sampling,
			embeddings=embeddings, embeddings_type=embeddings_type,
			embeddings_model_config=embeddings_model_config,
			clustering=clustering, 
			clustering_model_config=clustering_model_config,
			multi_label=multi_label, name=name)

	def validate(self):
		"""
			Delegate to the parent ``validate`` with the two components this
			learning relies on: ``'embeddings'`` and ``'clustering'``.
			Presumably the parent raises when one of them is not configured;
			confirm against :class:`nlpatl.learning.Learning`.
		"""
		super().validate(['embeddings', 'clustering'])

	def explore(self, inputs: List[str], return_type: str = 'dict', 
		num_sample: int = 2) -> Union[Dataset, dict]:
		"""
			Select the data points that should be annotated next.

			The inputs are converted to features by the embeddings model, the 
			clustering model is trained on those features and then used to 
			score them, and the sampling function picks the records to keep.

			:param inputs: Raw inputs to explore.
			:type inputs: list of str
			:param return_type: Format of the returned object, forwarded to
				``get_return_object``. Default is `dict`.
			:type return_type: str
			:param num_sample: Forwarded to the sampling function as 
				``num_sample``. Default is 2.
			:type num_sample: int

			:return: The sampled records, in the format requested by 
				``return_type``.
			:rtype: :class:`nlpatl.dataset.Dataset` or dict
		"""

		self.validate()

		features = self.embeddings_model.convert(inputs)

		# The clustering model is (re)trained on every call before scoring.
		self.clustering_model.train(features)
		preds = self.clustering_model.predict_proba(features)

		indices, values = self.sampling(
			preds.values, preds.groups, num_sample=num_sample)
		preds.keep(indices)
		# Replace original probabilities by sampling values
		preds.values = values

		# Attach the raw inputs that correspond to the records kept above.
		preds.inputs = [inputs[i] for i in preds.indices.tolist()]

		return self.get_return_object(preds, return_type)