Source code for nlpatl.learning.semi_supervised_learning

from typing import List, Union, Callable, Optional
from collections import defaultdict
import numpy as np

from nlpatl.models.classification import Classification
from nlpatl.models.embeddings import Embeddings
from nlpatl.sampling.certainty import MostConfidenceSampling
from nlpatl.learning import Learning
from nlpatl.dataset import Dataset


class SemiSupervisedLearning(Learning):
    """
    | Applying both active learning and semi-supervised learning approach
      to annotate the most valuable data points. You may refer to
      https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0162075&type=printable .
      Here is the pseudo:
    | 1. [NLPatl] Convert raw data to features (Embeddings model)
    | 2. [NLPatl] Train model and classifying data points (Classification
      model)
    | 3. [NLPatl] Estimate the most valuable data points (Sampling)
    | 4. [Human] Subject matter experts annotates the most valuable data
      points
    | 5. [NLPatl] Retrain classification model
    | 6. [NLPatl] Classify unlabeled data points and labeling those
      confidences are higher than `self_learn_threshold`
    | 7. Repeat Step 2 to 6 until acquire enough data points.

    :param sampling: Sampling method for get the most valuable data points.
        Providing certified methods name (`most_confidence`, `entropy`,
        `least_confidence`, `margin`, `nearest_mean`, `fathest`)
        or custom function.
    :type sampling: str or function
    :param embeddings: Function for converting raw data to embeddings.
        Providing model name according to embeddings type. For example,
        `multi-qa-MiniLM-L6-cos-v1` for `sentence_transformers`.
        `bert-base-uncased` for `transformers`. `vgg16` for `torch_vision`.
    :type embeddings: str or :class:`nlpatl.models.embeddings.Embeddings`
    :param embeddings_model_config: Configuration for embeddings models.
        Optional. Ignored if using custom embeddings class
    :type embeddings_model_config: dict
    :param embeddings_type: Type of embeddings. `sentence_transformers`
        for text, `transformers` for text or `torch_vision` for image
    :type embeddings_type: str
    :param classification: Function for classifying inputs. Either
        providing certified methods (`logistic_regression`, `svc`,
        `linear_svc`, `random_forest` and `xgboost`) or custom function.
    :type classification: str or
        :class:`nlpatl.models.classification.Classification`
    :param classification_model_config: Configuration for classification
        models. Optional. Ignored if using custom classification class
    :type classification_model_config: dict
    :param multi_label: Forwarded unchanged to the parent ``Learning``
        constructor. Default is False
    :type multi_label: bool
    :param self_learn_threshold: The minimum threshold for classifying
        probabilities. Data will be labeled automatically if probability
        is higher than this value. Default is 0.9
    :type self_learn_threshold: float
    :param name: Name of this learning.
    :type name: str
    """

    def __init__(self, sampling: Union[str, Callable],
                 embeddings: Union[str, Embeddings],
                 classification: Union[str, Classification],
                 embeddings_type: Optional[str] = None,
                 embeddings_model_config: Optional[dict] = None,
                 classification_model_config: Optional[dict] = None,
                 multi_label: bool = False,
                 self_learn_threshold: float = 0.9,
                 name: str = 'semi_supervised_learning'):
        super().__init__(
            sampling=sampling,
            embeddings=embeddings,
            embeddings_type=embeddings_type,
            embeddings_model_config=embeddings_model_config,
            classification=classification,
            classification_model_config=classification_model_config,
            multi_label=multi_label,
            name=name)

        # Dedicated sampler used only for the self-learning step; it is
        # configured with the same threshold that is stored below.
        self.most_confidence_sampling = MostConfidenceSampling(
            threshold=self_learn_threshold).sample

        # Self-learnt (automatically labelled) data; empty until
        # explore_educate_in_notebook() finds confident predictions.
        self.self_learn_indices = None
        self.self_learn_x = None
        self.self_learn_x_features = None
        self.self_learn_y = None
        self.self_learn_threshold = self_learn_threshold

    def validate(self):
        """
        Run the parent validation for the `embeddings` and
        `classification` components.
        """
        super().validate(['embeddings', 'classification'])

    @staticmethod
    def _has_data(data) -> bool:
        """
        Tell whether `data` is a non-empty collection.

        An explicit length check is used instead of truthiness because
        truth-testing a multi-element :class:`numpy.ndarray` raises
        ``ValueError``.
        """
        return data is not None and len(data) > 0

    def get_self_learn_data(self):
        """
        Get all self learnt data points

        :return: Self learnt data points
        :rtype: Tuple of indices, x, x features and y. Every entry is
            None until self learning has stored a value for it.
        """
        return (self.self_learn_indices, self.self_learn_x,
                self.self_learn_x_features, self.self_learn_y)

    def learn(self,
              x: Optional[Union[List[str], List[int], List[float],
                                np.ndarray]] = None,
              y: Optional[Union[List[str], List[int]]] = None,
              include_learn_data: bool = True):
        """
        Train the classification model.

        :param x: Raw inputs. Optional.
        :type x: list or :class:`numpy.ndarray`
        :param y: Labels of `x`. Optional.
        :type y: list
        :param include_learn_data: If True, `x`/`y` are combined with the
            stored `learn_x`/`learn_y` and `self_learn_x`/`self_learn_y`
            before training. If False, only `x`/`y` are used.
        :type include_learn_data: bool
        """
        self.validate()

        if include_learn_data:
            # Skip sources that are missing or empty. `_has_data` replaces
            # a bare truthiness test, which fails on numpy arrays.
            all_x = self.concatenate(
                [d for d in (x, self.learn_x, self.self_learn_x)
                 if self._has_data(d)])
            all_y = self.concatenate(
                [d for d in (y, self.learn_y, self.self_learn_y)
                 if self._has_data(d)])
        else:
            all_x = x
            all_y = y

        self.add_unique_y(all_y)

        x_features = self.embeddings_model.convert(all_x)
        self.classification_model.train(x_features, all_y)

    def explore(self,
                x: Union[List[str], List[int], List[float], np.ndarray],
                return_type: str = 'dict',
                num_sample: int = 10) -> Union[Dataset, dict]:
        """
        Pick the data points selected by the configured sampling method.

        :param x: Raw inputs to explore.
        :type x: list or :class:`numpy.ndarray`
        :param return_type: Forwarded to `get_return_object` to choose the
            result format. Default is `dict`
        :type return_type: str
        :param num_sample: Forwarded to the sampling method as
            `num_sample`. Default is 10
        :type num_sample: int

        :return: The sampled predictions, with `values` holding the
            sampling values and `inputs` holding the matching raw inputs.
        :rtype: :class:`nlpatl.dataset.Dataset` or dict
        """
        self.validate()

        x_features = self.embeddings_model.convert(x)
        preds = self.classification_model.predict_proba(x_features)

        indices, values = self.sampling(preds.values, num_sample=num_sample)
        preds.keep(indices)
        # Replace original probabilities by sampling values
        preds.values = values

        # Attach the raw inputs matching the kept predictions.
        preds.inputs = [x[i] for i in preds.indices.tolist()]

        return self.get_return_object(preds, return_type)

    def explore_educate_in_notebook(
            self,
            x: Union[List[str], List[int], List[float], np.ndarray],
            num_sample: int = 2,
            data_type: str = 'text'):
        """
        Run the parent notebook annotation flow, retrain, then self-label
        the data points picked by the most-confidence sampler.

        :param x: Raw inputs.
        :type x: list or :class:`numpy.ndarray`
        :param num_sample: Forwarded to the parent implementation.
            Default is 2
        :type num_sample: int
        :param data_type: Forwarded to the parent implementation.
            Default is `text`
        :type data_type: str
        """
        super().explore_educate_in_notebook(
            x=x, num_sample=num_sample, data_type=data_type)

        # Train model after human annotation
        self.learn()

        # Identify high confidence unannotated data
        unannotated_x = self.filter(x, self.learn_indices)
        x_features = self.embeddings_model.convert(unannotated_x)
        preds = self.classification_model.predict_proba(x_features)
        indices, values = self.most_confidence_sampling(
            preds.values, len(preds))

        if len(indices) > 0:
            preds.keep(indices)
            # Replace original probabilities by sampling values
            preds.values = values

            # NOT original indices. These are filtered indices
            indices = preds.indices
            # NOTE(review): `self_learn_indices` is not assigned here, so
            # get_self_learn_data() keeps returning the value set in
            # __init__ for it. Confirm which index space (original `x` or
            # `unannotated_x`) it is meant to hold before populating it.
            self.self_learn_x = self.filter(unannotated_x, indices)
            self.self_learn_x_features = self.filter(preds.features, indices)
            self.self_learn_y = self.filter(preds.groups, indices)