# Source code for nlpatl.learning.mismatch_farthest_learning

from typing import List, Union, Callable, Optional
from collections import defaultdict
import numpy as np

from nlpatl.dataset import Dataset
from nlpatl.learning import Learning
from nlpatl.sampling import Sampling

import nlpatl.models.embeddings as nme
import nlpatl.models.clustering as nmclu
import nlpatl.models.classification as nmcla
import nlpatl.sampling.uncertainty as nsunc
import nlpatl.sampling.clustering as nsclu

# Clustering model names mapped to the class that implements each one.
# Returned as-is by MismatchFarthestLearning.get_clustering_mapping().
CLUSTERING_MODEL_FOR_MISMATCH_FARTHEST_MAPPING_NAMES = {
	'kmeans': nmclu.SkLearnClustering
}

# Sampling method names mapped to a sampler object. Returned as-is by
# MismatchFarthestLearning.get_sampling_mapping().
# Note: unlike the clustering mapping above, the value here is an instance
# constructed once at import time, not a class.
# NOTE(review): presumably the base class resolves a name through this mapping
# in init_sampling() -- confirm against nlpatl.learning.Learning.
SAMPLING_FOR_MISMATCH_FARTHEST_MAPPING_NAMES = {
	'nearest_mean': nsclu.NearestMeanSampling(),
}


class MismatchFarthestLearning(Learning):
	"""
		| Applying mis-match first farthest traversal method approach (with modification)
			to annotate the most valuable data points.
			Refer to http://zhaoshuyang.com/static/documents/MAL2.pdf for details.
			Here is the pseudocode:
		|	1. [NLPatl] Convert raw data to features (Embeddings model)
		|	2. [NLPatl] Train model and cluster data points (Clustering model)
		|	3. [NLPatl] Estimate the most valuable data points (Sampling)
		|	4. [Human] Subject matter experts annotate the most valuable data points
		|	5. [NLPatl] Train classification model (Classification model)
		|	6. [NLPatl] Classify unlabeled data points and compare with the clustering
			model result according to the farthest mismatch data points
		|	7. [Human] Subject matter experts annotate the most valuable data points
		|	8. Repeat Step 2 to 7 until acquire enough data points or reach other
			exit criteria.

		:param clustering_sampling: Clustering sampling method for stage 1 exploration.
			Providing certified methods name (`nearest_mean`) or custom function.
		:type clustering_sampling: str or function
		:param embeddings: Function for converting raw data to embeddings. Providing
			model name according to embeddings type. For example, `multi-qa-MiniLM-L6-cos-v1`
			for `sentence_transformers`. `bert-base-uncased` for
			`transformers`. `vgg16` for `torch_vision`.
		:type embeddings: str or :class:`nlpatl.models.embeddings.Embeddings`
		:param embeddings_model_config: Configuration for embeddings models. Optional. Ignored
			if using custom embeddings class
		:type embeddings_model_config: dict
		:param embeddings_type: Type of embeddings. `sentence_transformers` for text,
			`transformers` for text or `torch_vision` for image
		:type embeddings_type: str
		:param clustering: Function for clustering inputs. Either providing
			certified methods (`kmeans`) or custom function.
		:type clustering: str or :class:`nlpatl.models.clustering.Clustering`
		:param clustering_model_config: Configuration for clustering models. Optional. Ignored
			if using custom clustering class
		:type clustering_model_config: dict
		:param classification: Function for classifying inputs. Either providing
			certified methods (`logistic_regression`, `svc`, `linear_svc`, `random_forest`
			and `xgboost`) or custom function.
		:type classification: str or :class:`nlpatl.models.classification.Classification`
		:param classification_model_config: Configuration for classification models. Optional.
			Ignored if using custom classification class
		:type classification_model_config: dict
		:param multi_label: Indicate the classification model is multi-label or
			multi-class (or binary). Default is False.
		:type multi_label: bool
		:param name: Name of this learning.
		:type name: str
	"""

	def __init__(self, clustering_sampling: Union[str, Callable],
		embeddings: Union[str, nme.Embeddings],
		clustering: Union[str, nmclu.Clustering],
		classification: Union[str, nmcla.Classification],
		embeddings_type: Optional[str] = None,
		embeddings_model_config: Optional[dict] = None,
		clustering_model_config: Optional[dict] = None,
		classification_model_config: Optional[dict] = None,
		multi_label: bool = False,
		name: str = 'mismatch_farthest_learning'):

		# The base class receives every model-related argument unchanged.
		super().__init__(
			embeddings=embeddings, embeddings_type=embeddings_type,
			embeddings_model_config=embeddings_model_config,
			clustering=clustering,
			clustering_model_config=clustering_model_config,
			classification=classification,
			classification_model_config=classification_model_config,
			multi_label=multi_label, name=name)

		# Stage 1 sampler is user-selectable; the two stage 2 samplers are
		# fixed and stored as bound ``sample`` methods.
		self.clustering_sampling = self.init_sampling(clustering_sampling)
		self.mismatch_sampling = nsunc.MismatchSampling().sample
		self.farthest_sampling = nsclu.FarthestSampling().sample

	def get_sampling_mapping(self) -> dict:
		"""
			Return the module-level mapping of sampling names to samplers.
		"""
		return SAMPLING_FOR_MISMATCH_FARTHEST_MAPPING_NAMES

	def get_clustering_mapping(self) -> dict:
		"""
			Return the module-level mapping of clustering names to model classes.
		"""
		return CLUSTERING_MODEL_FOR_MISMATCH_FARTHEST_MAPPING_NAMES

	def validate(self):
		"""
			Run the base-class validation for the three components this
			learning uses: embeddings, clustering and classification.
		"""
		super().validate(['embeddings', 'clustering', 'classification'])

	def learn_clustering(self, x: np.ndarray, model_config: dict):
		"""
			Re-create the clustering model from `model_config` (through
			`init_clustering_model`), store it on `self.clustering_model`
			and train it on `x`.

			:param x: Features to cluster.
			:type x: np.ndarray
			:param model_config: Configuration forwarded to `init_clustering_model`.
			:type model_config: dict
		"""
		_, self.clustering_model = self.init_clustering_model(
			model_config=model_config
			)

		self.clustering_model.train(x)

	def learn_classifier(self, x: np.ndarray, y: Union[List[str], List[int]]):
		"""
			Train the classification model on features `x` and labels `y`.

			:param x: Features of the annotated data points.
			:type x: np.ndarray
			:param y: Labels of the annotated data points.
			:type y: list of str or list of int
		"""
		self.classification_model.train(x, y)

	def build_seq_encoder(self, labels: Union[List[str], List[int]]):
		"""
			Encode every label by its position and build the lookup tables
			used by the second stage.

			:param labels: Labels of the annotated data points.
			:type labels: list of str or list of int

			:return: Tuple of (positions `0..n-1`, mapping of position to label,
				mapping of label to the sorted positions carrying that label).
			:rtype: tuple
		"""
		label_decoder = dict(enumerate(labels))
		encoded_values = list(label_decoder)

		# Group positions by label; labels keep their first-seen order.
		positions_by_label = defaultdict(list)
		for position, label in label_decoder.items():
			positions_by_label[label].append(position)
		unique_y_encoder = {
			label: sorted(positions)
			for label, positions in positions_by_label.items()
		}

		return encoded_values, label_decoder, unique_y_encoder

	def explore_first_stage(self, x: np.ndarray,
		num_sample: int = 1) -> Union[Dataset, dict]:
		"""
			First stage: train the clustering model on `x` and keep only the
			data points picked by `clustering_sampling`.

			:param x: Features of all data points.
			:type x: np.ndarray
			:param num_sample: Currently not forwarded; the sampler is always
				called with `num_sample=1`.
			:type num_sample: int

			:return: Clustering predictions narrowed to the sampled points, with
				`values` replaced by the values returned from the sampler.
		"""

		self.validate()

		self.clustering_model.train(x)
		preds = self.clustering_model.predict_proba(x)

		# NOTE(review): ``num_sample`` is ignored here and 1 is always used.
		# Callers pass their overall annotation target as ``num_sample``, so
		# forwarding it may not be the intent -- confirm before changing.
		indices, values = self.clustering_sampling(
			preds.values, preds.groups, num_sample=1)
		preds.keep(indices)
		# Replace original probabilities by sampling values
		preds.values = values

		return preds

	def explore_second_stage(self, x: np.ndarray, num_sample: int = 2):
		"""
			Second stage: cluster the unannotated data points around the
			annotated ones, classify them, and sample among the points where
			the clustering label and the classification label disagree.

			:param x: Features of all data points.
			:type x: np.ndarray
			:param num_sample: Value passed to the farthest sampling as its
				`num_sample` argument.
			:type num_sample: int

			:return: Clustering predictions narrowed by the farthest sampling result.
		"""
		# Get annotated dataset. The features returned by get_learn_data are
		# discarded because they are recomputed just below.
		learn_indices, learn_x, _, learn_y = self.get_learn_data()
		_, learn_y_decoder, unique_y_encoder = self.build_seq_encoder(learn_y)
		# TODO: cache
		learn_x_features = self.embeddings_model.convert(learn_x)

		# Build unannotated dataset. A set gives constant-time membership
		# tests instead of rescanning learn_indices for every row of x.
		annotated = set(learn_indices)
		keep_indices = [i for i in range(len(x)) if i not in annotated]
		unannotated_x_features = x[keep_indices]

		# Train clustering: one cluster per annotated data point, seeded with
		# that point's features, so cluster id ``g`` maps to annotated
		# position ``g`` (and to its label through learn_y_decoder).
		model_config = {
			'n_clusters': len(learn_x),
			'init': learn_x_features,
			'n_init': 1
		}
		self.learn_clustering(x=unannotated_x_features,
			model_config=model_config)
		clustering_predictions = self.clustering_model.predict_proba(unannotated_x_features)
		clustering_preds = [learn_y_decoder[g] for g in clustering_predictions.groups]

		# Train classifier
		self.learn_classifier(x=learn_x_features, y=learn_y)
		probs = self.classification_model.model.predict_proba(unannotated_x_features)
		preds = np.argmax(probs, axis=1)
		classification_preds = [self.classification_model.label_decoder[y] for y in preds]

		# Find mismatch
		mismatch_indices = self.mismatch_sampling(
			clustering_preds, classification_preds, num_sample=len(clustering_preds))

		# Clusters that share a label are merged into one group, identified by
		# the first annotated position carrying that label.
		new_groups = np.array([unique_y_encoder[label][0] for label in clustering_preds])
		new_groups = new_groups[mismatch_indices].flatten()
		new_values = clustering_predictions.values[mismatch_indices].flatten()

		# NOTE(review): ``positions`` is computed from the mismatch-only arrays
		# but is applied to the full prediction set below; confirm that the
		# farthest sampler and ``keep`` agree on which index space is used.
		positions, _ = self.farthest_sampling(new_values, new_groups, num_sample)
		clustering_predictions.keep(positions)

		return clustering_predictions

	def explore_educate_in_notebook(self,
		x: Union[List[str], List[int], List[float], np.ndarray],
		num_sample: int = 5, num_sample_per_cluster: int = 2,
		data_type: str = 'text'):
		"""
			Run the first stage once, then repeat the second stage, showing
			each batch of selected data points in the notebook.

			:param x: Raw inputs.
			:type x: list of str, list of int, list of float or np.ndarray
			:param num_sample: The second stage repeats while fewer than this
				many data points are in `self.learn_x`.
			:type num_sample: int
			:param num_sample_per_cluster: Passed to the second stage as `num_sample`.
			:type num_sample_per_cluster: int
			:param data_type: Forwarded to `show_in_notebook`.
			:type data_type: str
		"""

		x_features = self.embeddings_model.convert(x)

		# First stage clustering learning
		valuable_data = self.explore_first_stage(
			x_features, num_sample=num_sample)
		valuable_data.inputs = [x[i] for i in valuable_data.indices]
		self.show_in_notebook(valuable_data, data_type=data_type)

		# Second stage mismatch-farthest
		# NOTE(review): this loop only ends once self.learn_x grows to
		# num_sample; presumably the annotation flow behind show_in_notebook
		# adds to it -- confirm.
		while len(self.learn_x) < num_sample:
			valuable_data = self.explore_second_stage(
				x=x_features, num_sample=num_sample_per_cluster)
			valuable_data.inputs = [x[i] for i in valuable_data.indices]

			self.show_in_notebook(valuable_data, data_type=data_type)