"""Source code for nlpatl.models.embeddings.transformers."""

from typing import List
import numpy as np

try:
    from transformers import AutoTokenizer, AutoModel, TFAutoModel
except ImportError:
    # No installation required if not using this function
    pass
try:
    import torch
except ImportError:
    # No installation required if not using this function
    pass
try:
    import tensorflow as tf
except ImportError:
    # No installation required if not using this function
    pass

from nlpatl.models.embeddings.embeddings import Embeddings


class Transformers(Embeddings):
    """
    A wrapper of transformers class.

    :param model_name_or_path: transformers model name.
    :type model_name_or_path: str
    :param batch_size: Batch size of data processing. Default is 16
    :type batch_size: int
    :param padding: Inputs may not have same size. Set True to pad it.
        Default is False
    :type padding: bool
    :param truncation: Inputs may not have same size. Set True to truncate it.
        Default is False
    :type truncation: bool
    :param nn_fwk: Neural network framework. Either pt (for PyTorch) or tf
        (for TensorFlow)
    :type nn_fwk: str
    :param name: Name of this embeddings
    :type name: str

    >>> import nlpatl.models.embeddings as nme
    >>> model = nme.Transformers()
    """

    def __init__(
        self,
        model_name_or_path: str,
        batch_size: int = 16,
        padding: bool = False,
        truncation: bool = False,
        nn_fwk: str = None,
        name: str = "transformers",
    ):
        super().__init__(batch_size=batch_size, name=name)

        self.model_name_or_path = model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        if nn_fwk == "pt":
            self.model = AutoModel.from_pretrained(model_name_or_path)
            # Inference only: disable dropout/batch-norm training behavior.
            self.model.eval()
        elif nn_fwk == "tf":
            # Keras/TF models have no eval() equivalent; training-only layers
            # are disabled by default at call time (training=False).
            self.model = TFAutoModel.from_pretrained(model_name_or_path)
        self.padding = padding
        self.truncation = truncation
        self.nn_fwk = nn_fwk

    def convert(self, x: List[str]) -> np.ndarray:
        """
        Convert texts into embeddings via the model's pooled output.

        :param x: Input texts.
        :type x: list of str
        :return: 2-D array of embeddings, one row per input text.
        :rtype: np.ndarray
        """
        results = []
        for batch_inputs in self.batch(x, self.batch_size):
            ids = self.tokenizer(
                batch_inputs,
                return_tensors=self.nn_fwk,
                padding=self.padding,
                truncation=self.truncation,
            )

            # Only the PyTorch backend needs (or has) torch.no_grad().
            # Referencing torch unconditionally broke the TensorFlow path
            # when torch is not installed (NameError) and needlessly
            # tracked gradients otherwise.
            if self.nn_fwk == "pt":
                with torch.no_grad():
                    output = self.model(**ids)
            else:
                output = self.model(**ids)

            assert "pooler_output" in output.keys(), (
                "This model (`{}`) does not provide single embeddings for "
                "input. Switch to use other type of transformers model such as "
                "`bert-base-uncased` or `roberta-base`.".format(
                    self.model_name_or_path
                )
            )
            results.append(output["pooler_output"])

        if self.nn_fwk == "pt":
            return torch.cat(results).numpy()
        elif self.nn_fwk == "tf":
            return tf.concat(results, axis=0).numpy()