Reading a pretrained huggingface transformer directly from S3
Loading a pretrained Hugging Face transformer model seemingly requires you to have the model saved locally (as described here), such that you simply pass a local path to your model and config:
model = PreTrainedModel.from_pretrained('path/to/model', local_files_only=True)
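For context, the local layout from_pretrained expects is exactly what save_pretrained writes out: a config.json next to a pytorch_model.bin. A quick, illustrative way to produce one (the model name here is just an example):

from transformers import BertModel  # any concrete model class works

model = BertModel.from_pretrained('bert-base-uncased')
model.save_pretrained('path/to/model')  # writes config.json and pytorch_model.bin

Those two files are what would then be uploaded under a common prefix in the bucket.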
Is this possible with the model stored on S3?

Answering my own question... (this is apparently encouraged)
I got this working with a temporary file (NamedTemporaryFile), which does the trick. I was hoping to find an in-memory solution (i.e. passing BytesIO directly to from_pretrained), but that would require a patch to the transformers codebase; a torch-based workaround is sketched after the code below.
import boto3
import json
from contextlib import contextmanager
from io import BytesIO
from tempfile import NamedTemporaryFile
from transformers import PretrainedConfig, PreTrainedModel


@contextmanager
def s3_fileobj(bucket, key):
    """
    Yields a file object from the file at {bucket}/{key}

    Args:
        bucket (str): Name of the S3 bucket where your model is stored
        key (str): Relative path from the base of your bucket, including the
            filename and extension of the object to be retrieved.
    """
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket=bucket, Key=key)
    yield BytesIO(obj["Body"].read())


def load_model(bucket, path_to_model, model_name='pytorch_model'):
    """
    Load a model at the given S3 path. It is assumed that your model is stored at the key:

        '{path_to_model}/{model_name}.bin'

    and that a config has also been generated at the same path, named:

        '{path_to_model}/config.json'
    """
    # Stream the weights into a named temporary file so that
    # from_pretrained can read them from a real path on disk.
    tmp_file = NamedTemporaryFile()
    with s3_fileobj(bucket, f'{path_to_model}/{model_name}.bin') as f:
        tmp_file.write(f.read())
    tmp_file.flush()  # ensure buffered bytes are on disk before loading

    with s3_fileobj(bucket, f'{path_to_model}/config.json') as f:
        dict_data = json.load(f)
        config = PretrainedConfig.from_dict(dict_data)

    model = PreTrainedModel.from_pretrained(tmp_file.name, config=config)
    return model
model = load_model('my_bucket', 'path/to/model')
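For reference, a fully in-memory variant is possible if you bypass from_pretrained and load the state dict with torch directly. The following is a minimal sketch, assuming the checkpoint was saved via save_pretrained from a concrete model class (BertModel below is a placeholder for whatever architecture you actually trained), so that the state-dict keys line up:

import boto3
import json
import torch
from io import BytesIO
from transformers import BertConfig, BertModel  # placeholders; swap in your architecture


def load_model_in_memory(bucket, path_to_model):
    """Hypothetical in-memory variant: no temp file; the weights are read
    straight out of a BytesIO buffer by torch.load."""
    s3 = boto3.client("s3")

    # boto3's StreamingBody exposes .read(), so json.load can consume it directly
    config_obj = s3.get_object(Bucket=bucket, Key=f'{path_to_model}/config.json')
    config = BertConfig.from_dict(json.load(config_obj["Body"]))

    weights_obj = s3.get_object(Bucket=bucket, Key=f'{path_to_model}/pytorch_model.bin')
    state_dict = torch.load(BytesIO(weights_obj["Body"].read()), map_location="cpu")

    # Only works if the checkpoint keys match the class you instantiate;
    # a checkpoint saved with save_pretrained on the same class should.
    model = BertModel(config)
    model.load_state_dict(state_dict)
    return model

The trade-off is that you lose from_pretrained's architecture dispatch and key remapping, so this only suits checkpoints whose model class you already know.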