Source code for nlpatl.models.embeddings.transformers
from typing import List
import numpy as np
try:
from transformers import (
AutoTokenizer,
AutoModel, TFAutoModel
)
except ImportError:
# No installation required if not using this function
pass
try:
import torch
except ImportError:
# No installation required if not using this function
pass
try:
import tensorflow as tf
except ImportError:
# No installation required if not using this function
pass
from nlpatl.models.embeddings.embeddings import Embeddings
class Transformers(Embeddings):
    """
    A wrapper of transformers class. Refer to
    https://huggingface.co/docs/transformers/index for available models.

    :param model_name_or_path: transformers model name.
    :type model_name_or_path: str
    :param batch_size: Batch size of data processing. Default is 16
    :type batch_size: int
    :param padding: Inputs may not have same size. Set True to pad it.
        Default is False
    :type padding: bool
    :param truncation: Inputs may not have same size. Set True to
        truncate it. Default is False
    :type truncation: bool
    :param nn_fwk: Neural network framework. Either pt (for PyTorch) or
        tf (for TensorFlow). Required; any other value (including the
        default None) raises ValueError.
    :type nn_fwk: str
    :param name: Name of this embeddings
    :type name: str

    :raises ValueError: If `nn_fwk` is neither 'pt' nor 'tf'.

    >>> import nlpatl.models.embeddings as nme
    >>> model = nme.Transformers('bert-base-uncased', nn_fwk='pt')
    """

    def __init__(self, model_name_or_path: str, batch_size: int = 16,
                 padding: bool = False, truncation: bool = False,
                 nn_fwk: str = None, name: str = 'transformers'):
        # Fail fast: without a supported framework no model would be
        # loaded and `convert` could only fail later with an obscure
        # AttributeError on `self.model`.
        if nn_fwk not in ('pt', 'tf'):
            raise ValueError(
                'Unsupported `nn_fwk` ({!r}). Use `pt` (PyTorch) or '
                '`tf` (TensorFlow).'.format(nn_fwk))

        super().__init__(batch_size=batch_size, name=name)

        self.model_name_or_path = model_name_or_path
        self.padding = padding
        self.truncation = truncation
        self.nn_fwk = nn_fwk

        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        if nn_fwk == 'pt':
            self.model = AutoModel.from_pretrained(model_name_or_path)
            # Switch to inference mode.
            self.model.eval()
        else:
            self.model = TFAutoModel.from_pretrained(model_name_or_path)

    def convert(self, x: List[str]) -> np.ndarray:
        """
        Convert raw texts to embeddings using the model's `pooler_output`.

        :param x: Raw texts to be converted.
        :type x: list of str

        :return: Pooled outputs of all batches concatenated along the
            first axis.
        :rtype: np.ndarray

        :raises AssertionError: If the model output does not contain
            `pooler_output`.
        """
        use_torch = self.nn_fwk == 'pt'

        results = []
        for batch_inputs in self.batch(x, self.batch_size):
            ids = self.tokenizer(
                batch_inputs,
                return_tensors=self.nn_fwk,
                padding=self.padding,
                truncation=self.truncation)

            if use_torch:
                # Gradients are not needed for feature extraction.
                with torch.no_grad():
                    output = self.model(**ids)
            else:
                # Do not touch `torch` here: it may not be installed
                # when the TensorFlow backend is used.
                output = self.model(**ids)

            # Explicit check instead of an `assert` statement so that it
            # is not stripped under `python -O`. AssertionError is kept
            # so that existing handlers keep working.
            if 'pooler_output' not in output.keys():
                raise AssertionError(
                    'This model (`{}`) does not provide single embeddings for '
                    'input. Switch to use other type of transformers model such as '
                    '`bert-base-uncased` or `roberta-base`.'.format(
                        self.model_name_or_path))

            results.append(output['pooler_output'])

        if use_torch:
            return torch.cat(results).numpy()
        return tf.concat(results, axis=0).numpy()