tokenizer train
Fetching a tokenizer from Hugging Face and continuing to train it on my own dataset:
from transformers import AutoTokenizer
import pandas as pd

def batch_iterator(text, batch_size=1000):
    # yield the corpus in chunks so the trainer doesn't
    # need the whole list in one pass
    for i in range(0, len(text), batch_size):
        yield text[i : i + batch_size]

def main():
    # data
    df = pd.read_pickle('train_data.pkl')
    text = df.text.tolist()

    # base tokenizer
    # train_new_from_iterator only works when tokenizer.is_fast == True
    tokenizer = AutoTokenizer.from_pretrained("gogamza/kobart-summarization")

    # train a new tokenizer with the same algorithm/config on our corpus
    tokenizer = tokenizer.train_new_from_iterator(batch_iterator(text), vocab_size=30000)

    # save
    tokenizer.save_pretrained("mytokenizer")
    return tokenizer

if __name__ == "__main__":
    main()
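
To sanity-check the result, you can reload the saved tokenizer and compare its output against the base one. A minimal sketch, assuming the "mytokenizer" directory was written by the script above; the sample sentence is just a hypothetical illustration:

from transformers import AutoTokenizer

# base tokenizer vs. the newly trained one (names/paths from the script above)
base = AutoTokenizer.from_pretrained("gogamza/kobart-summarization")
mine = AutoTokenizer.from_pretrained("mytokenizer")

sample = "후속 학습한 토크나이저를 테스트하는 문장입니다."  # hypothetical test sentence

# a vocab trained on your own corpus usually splits domain text
# into fewer, more meaningful subwords
print(base.tokenize(sample))
print(mine.tokenize(sample))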