Source code for nlpatl.models.embeddings.transformers
from typing import List
import numpy as np
try:
from transformers import AutoTokenizer, AutoModel, TFAutoModel
except ImportError:
# No installation required if not using this function
pass
try:
import torch
except ImportError:
# No installation required if not using this function
pass
try:
import tensorflow as tf
except ImportError:
# No installation required if not using this function
pass
from nlpatl.models.embeddings.embeddings import Embeddings
class Transformers(Embeddings):
    """
    A wrapper of transformers class.

    :param model_name_or_path: transformers model name.
    :type model_name_or_path: str
    :param batch_size: Batch size of data processing. Default is 16
    :type batch_size: int
    :param padding: Inputs may not have same size. Set True to pad it.
        Default is False
    :type padding: bool
    :param truncation: Inputs may not have same size. Set True to
        truncate it. Default is False
    :type truncation: bool
    :param nn_fwk: Neural network framework. Either pt (for PyTorch) or
        tf (for TensorFlow). A value must be supplied; the default
        (None) is rejected because no model can be loaded for it.
    :type nn_fwk: str
    :param name: Name of this embeddings
    :type name: str
    :raises ValueError: If `nn_fwk` is neither `pt` nor `tf`.

    >>> import nlpatl.models.embeddings as nme
    >>> model = nme.Transformers("bert-base-uncased", nn_fwk="pt")
    """

    def __init__(
        self,
        model_name_or_path: str,
        batch_size: int = 16,
        padding: bool = False,
        truncation: bool = False,
        nn_fwk: str = None,
        name: str = "transformers",
    ):
        super().__init__(batch_size=batch_size, name=name)

        # Fail fast, before any tokenizer/model is loaded: an unsupported
        # framework would otherwise leave `self.model` undefined and only
        # surface later as an AttributeError inside `convert`.
        if nn_fwk not in ("pt", "tf"):
            raise ValueError(
                "`nn_fwk` must be either `pt` (PyTorch) or `tf` "
                "(TensorFlow) but got `{}`.".format(nn_fwk)
            )

        self.model_name_or_path = model_name_or_path
        self.padding = padding
        self.truncation = truncation
        self.nn_fwk = nn_fwk

        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        if nn_fwk == "pt":
            self.model = AutoModel.from_pretrained(model_name_or_path)
            # Switch the PyTorch module to evaluation mode.
            self.model.eval()
        else:
            self.model = TFAutoModel.from_pretrained(model_name_or_path)

    def convert(self, x: List[str]) -> np.ndarray:
        """
        Convert texts to embeddings taken from the model's
        `pooler_output`.

        :param x: Raw texts to embed.
        :type x: list of str
        :return: The `pooler_output` of every batch, concatenated along
            the first axis.
        :rtype: np.ndarray
        :raises AssertionError: If the model output does not contain
            `pooler_output`.
        :raises ValueError: If `self.nn_fwk` is neither `pt` nor `tf`.
        """
        results = []
        for batch_inputs in self.batch(x, self.batch_size):
            ids = self.tokenizer(
                batch_inputs,
                return_tensors=self.nn_fwk,
                padding=self.padding,
                truncation=self.truncation,
            )

            if self.nn_fwk == "pt":
                # Gradient tracking is only a PyTorch concern; keeping it
                # inside this branch means the TensorFlow path never
                # touches `torch`, which may not be installed.
                with torch.no_grad():
                    output = self.model(**ids)
            else:
                output = self.model(**ids)

            # Raised explicitly (instead of using `assert`) so the check
            # still runs under `python -O`; the exception type is kept as
            # AssertionError for callers that already catch it.
            if "pooler_output" not in output.keys():
                raise AssertionError(
                    "This model (`{}`) does not provide single embeddings for "
                    "input. Switch to use other type of transformers model such as "
                    "`bert-base-uncased` or `roberta-base`.".format(
                        self.model_name_or_path
                    )
                )
            results.append(output["pooler_output"])

        if self.nn_fwk == "pt":
            return torch.cat(results).numpy()
        if self.nn_fwk == "tf":
            return tf.concat(results, axis=0).numpy()
        # Only reachable if `nn_fwk` was changed after construction; never
        # silently return None from a method annotated to return an array.
        raise ValueError(
            "`nn_fwk` must be either `pt` (PyTorch) or `tf` "
            "(TensorFlow) but got `{}`.".format(self.nn_fwk)
        )