
Training a tokenizer

thisisw 2024. 3. 29. 17:18

Take a tokenizer from Hugging Face and retrain it on my own dataset. Note that train_new_from_iterator does not fine-tune the existing vocabulary; it learns a fresh vocabulary from your corpus while reusing the base tokenizer's algorithm, normalization, and special tokens.

from transformers import AutoTokenizer
import pandas as pd

def batch_iterator(text, batch_size=1000):
    # Yield the corpus in chunks so the trainer can stream it
    # instead of holding one giant batch in memory.
    for i in range(0, len(text), batch_size):
        yield text[i : i + batch_size]

def main():
    # Load the training corpus (a DataFrame with a `text` column).
    df = pd.read_pickle('train_data.pkl')
    text = df.text.tolist()

    # Base tokenizer.
    # train_new_from_iterator only works when tokenizer.is_fast == True.
    tokenizer = AutoTokenizer.from_pretrained("gogamza/kobart-summarization")

    # Train a new vocabulary on my corpus with the same algorithm.
    tokenizer = tokenizer.train_new_from_iterator(batch_iterator(text), vocab_size=30000)

    # Save it so it can be reloaded later with AutoTokenizer.from_pretrained.
    tokenizer.save_pretrained("mytokenizer")

    return tokenizer

if __name__ == "__main__":
    main()
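
As a quick sanity check, you can reload the saved tokenizer and compare its output against the base one. A minimal sketch, assuming the script above has already been run so the "mytokenizer" directory exists; the sample sentence is just an illustration, use text from your own domain:

from transformers import AutoTokenizer

# Reload the base tokenizer and the newly trained one
# ("mytokenizer" is the directory written by save_pretrained above).
base = AutoTokenizer.from_pretrained("gogamza/kobart-summarization")
mine = AutoTokenizer.from_pretrained("mytokenizer")

# Hypothetical sample sentence.
sample = "허깅페이스 토크나이저를 내 데이터셋으로 새로 학습했다."
print(base.tokenize(sample))  # tokens from the original vocabulary
print(mine.tokenize(sample))  # tokens from the newly learned vocabulary

If the new tokenizer splits your domain text into fewer, more natural pieces, the retraining did what you wanted.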