티스토리 뷰

개발

[Pytorch] torchtext로 텍스트 classification

thisisw 2022. 7. 13. 19:03

pytorch의 tutorial 을 제가 가지고 있는 데이터셋에 적용해보았습니다.

import torch
from torch import nn
import torch.utils.data as data
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import time

from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

데이터 준비하기

dataframe을 이용해 torchtext.datasets에서 import한것과 같은 형태의 dataset iterator를 생성하기 위해서 Dataset을 상속받아 CustomDataset class를 생성합니다.

class CustomDataset(Dataset):
    """Map-style dataset pairing every input sample with its label.

    Args:
        x_data: Indexable collection of input samples (the texts).
        y_data: Indexable collection of labels, aligned with ``x_data``.
    """

    def __init__(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        """Return the number of samples, measured on ``x_data``."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return the ``(label, sample)`` pair stored at position ``idx``.

        Values are returned exactly as stored; no tensor conversion is done.
        """
        sample, label = self.x_data[idx], self.y_data[idx]
        return label, sample

dataframe df는 분류할 text가 들어있는 'text' 컬럼과 class가 들어있는 'target' 컬럼으로 구성되어있습니다.

# One-shot iterator over (label, text) pairs; once something has consumed it
# (the vocabulary build further down does), it yields nothing more.
data_iter = iter(CustomDataset(x_data = df['text'].tolist(), y_data = df['target'].tolist()))

데이터 처리 파이프라인 준비하기

torchtext의 build_vocab_from_iterator를 이용해 어휘집을 생성합니다. train data에 없는(단어장에 없는) 단어가 들어왔을 때 <unk>으로 임베딩할 수 있도록 <unk> 토큰을 specials에 추가하고, default index로 지정합니다.

def tokenizer(x):
    """Split ``x`` on single space characters and return the list of pieces."""
    return x.split(' ')

def yield_tokens(data_iter):
    """Lazily yield the token list of every sample's text.

    Each item of ``data_iter`` must unpack into exactly two values; the first
    is ignored and the second is passed to ``tokenizer``.
    """
    texts = (sentence for _label, sentence in data_iter)
    yield from map(tokenizer, texts)

# Build the vocabulary from the tokenised texts, registering '<unk>' as a
# special token, then make the index of '<unk>' the vocabulary's default index.
vocab = build_vocab_from_iterator(
    yield_tokens(data_iter),
    specials=['<unk>'],
)
vocab.set_default_index(vocab['<unk>'])

vocab은 아래와 같이 토큰 목록을 정수로 변환합니다. (hello는 vocab에 없는 단어입니다)

vocab(['안녕','파이썬','hello'])
>>> [1, 5, 0]

text_pipeline은 vocab에 정의된 lookup table에 기반해 문장을 정수 인덱스의 리스트로 변환하고, label_pipeline은 target을 정수로 변환합니다. target에 -1을 해주는 이유는 CrossEntropyLoss가 정답으로 0부터 (class 개수 - 1)까지의 class index를 기대하기 때문입니다. 즉, 1부터 시작하는 target을 0부터 시작하는 index(=target-1)로 맞춰준 뒤 loss를 계산합니다.

def text_pipeline(x):
    """Tokenise ``x`` and look every token up in ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Cast the label ``x`` to ``int`` and shift it down by one."""
    return int(x) - 1

데이터 batch와 iter 생성하기

길이가 변하는 input을(텍스트 데이터와 같은) 처리하기 위해서 dataloader의 collate_fn을 사용자정의 함수로 다시 재정의하여 사용합니다. (해당 tutorial을 참고하였습니다.)

    _label, _text : (2, '이 영화 생각보다 너무 재밌어요')일 때 각 변수에 아래와 같은 값들이 들어갑니다.

    label_list : 1 (= label - 1)
    text_list : tensor([  1,  13,  36,  16, 330]) (=text to tensor)
    offsets : 5 (=해당 문장의 토큰 개수. 반복문이 끝난 뒤 cumsum을 거쳐 각 문장이 text_list 안에서 시작하는 인덱스로 변환됩니다.)

def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors for ``EmbeddingBag``.

    Args:
        batch: Iterable of ``(label, text)`` pairs.

    Returns:
        A tuple ``(labels, token_ids, offsets)`` moved to ``device``:
        ``labels`` holds the processed label of each sample, ``token_ids`` is
        the concatenation of every sample's token indices, and ``offsets``
        holds the position in ``token_ids`` where each sample starts.
    """
    labels, token_tensors, lengths = [], [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_tensors.append(ids)
        lengths.append(ids.size(0))

    label_list = torch.tensor(labels, dtype=torch.int64)
    # Prepend 0 and drop the last length so the running sum gives the start
    # index of every sample inside the concatenated tensor.
    offsets = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    text_list = torch.cat(token_tensors)
    return label_list.to(device), text_list.to(device), offsets.to(device)

# Prefer CUDA when it is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the loader from a map-style dataset instead of `data_iter`: the default
# sampler needs len() on the dataset, which a plain iterator does not provide,
# and `data_iter` has already been consumed by the vocabulary build.
dataloader = DataLoader(
    CustomDataset(x_data=df['text'].tolist(), y_data=df['target'].tolist()),
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

모델 정의하기

input > embedding > linear_layer > output 으로 모델을 구성합니다.

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``EmbeddingBag`` followed by a ``Linear`` layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
        num_class: Number of output scores produced per sample.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill both weight matrices uniformly in [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        # Embedding first, then the linear layer (same order as the RNG draws).
        for layer in (self.embedding, self.fc):
            layer.weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Pool the embeddings of each bag and project them to class scores.

        Args:
            text: Token indices of all samples, concatenated.
            offsets: Start position of each sample inside ``text``.
        """
        return self.fc(self.embedding(text, offsets))

model은 어휘집(vocab)의 크기인 vocab_size, 임베딩 차원인 embed_dim, class의 개수인 num_class를 파라미터로 받습니다.

# Illustrative instantiation with literal sizes; `model` is assigned again
# further down using the dataset-derived vocab_size / emsize / num_class.
model = TextClassificationModel(vocab_size=1000, embed_dim=64, num_class=5).to(device)

train

전체 데이터를 7:3의 비율로 train / valid로 나누고, valid data는 train에 이용하지 않았습니다. (train_test_split은 scikit-learn의 sklearn.model_selection에서 import해야 합니다.)

# Hold out 30% of the rows (test_size=0.3) with shuffling and a fixed seed.
# NOTE(review): `train_test_split` is not imported anywhere in the visible
# code; presumably sklearn.model_selection.train_test_split — confirm and add
# the import.
x_train, x_valid, y_train, y_valid = train_test_split(df['text'],
                                                    df['target'],
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=1004)

train시 train data를 95:5의 비율로 train / test로 나눠서 사용하고 loss 계산은 CrossEntropyLoss를 사용합니다.

def train(dataloader, optimizer, criterion):
    """Run one training epoch over ``dataloader`` on the module-level ``model``.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss, called as ``criterion(predicted_label, label)``.

    Returns:
        None. The running accuracy is printed every ``log_interval`` batches
        and the running counters are reset after each report.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Cap the total gradient norm at 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('{:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(idx, len(dataloader),
                                              total_acc/total_count))
            # Start a fresh accuracy window for the next report.
            total_acc, total_count = 0, 0

def evaluate(dataloader, criterion):
    """Compute the accuracy of the module-level ``model`` over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(label, text, offsets)`` batches.
        criterion: Not used; kept so existing call sites keep working.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when the
        dataloader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    # An empty split (e.g. a tiny dataset) would otherwise divide by zero.
    if total_count == 0:
        return 0.0
    return total_acc/total_count

train_main에서 train + evaluate를 실행합니다.

def train_main(train_iter, num_class, vocab_size, emsize):
    """Train the module-level ``model`` for a fixed number of epochs.

    Args:
        train_iter: Iterable of samples, converted with ``to_map_style_dataset``.
        num_class: Not used inside this function.
        vocab_size: Not used inside this function.
        emsize: Not used inside this function.

    The samples are split 95/5 at random into a training part and a held-out
    part. After each epoch the held-out accuracy is printed; the scheduler is
    stepped only when that accuracy is lower than the best value seen so far,
    otherwise the best value is updated.
    """
    # Hyperparameters
    n_epochs = 10
    learning_rate = 0.1
    batch_size = 64

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    best_accu = None

    # Random 95/5 split into training and held-out samples.
    full_dataset = to_map_style_dataset(train_iter)
    n_total = len(full_dataset)
    n_train = int(n_total * 0.95)
    train_part, heldout_part = random_split(full_dataset,
                                            [n_train, n_total - n_train])

    loader_options = dict(batch_size=batch_size, shuffle=True,
                          collate_fn=collate_batch)
    train_dataloader = DataLoader(train_part, **loader_options)
    test_dataloader = DataLoader(heldout_part, **loader_options)

    separator = '-' * 59
    report = ('| end of epoch {:3d} | time: {:5.2f}s | '
              'test accuracy {:8.3f} ')
    for epoch in range(1, n_epochs + 1):
        epoch_start_time = time.time()
        train(train_dataloader, optimizer, criterion)
        accu_val = evaluate(test_dataloader, criterion)

        regressed = best_accu is not None and best_accu > accu_val
        if regressed:
            scheduler.step()
        else:
            best_accu = accu_val

        print(separator)
        print(report.format(epoch, time.time() - epoch_start_time, accu_val))
        print(separator)

model을 train 합니다.

# Number of classes = number of distinct values in the 'target' column.
num_class = df['target'].nunique()

# Embedding-table size = number of entries in the vocabulary.
vocab_size = len(vocab)

# Dimensionality of each token embedding.
emsize = 64

model = TextClassificationModel(
    vocab_size=vocab_size, embed_dim=emsize, num_class=num_class
).to(device)

%%time
# Fresh iterator over the training split; train_main passes it to
# to_map_style_dataset before splitting and batching.
train_iter = iter(CustomDataset(x_data = x_train.tolist(), y_data = y_train.tolist()))
train_main(train_iter, num_class, vocab_size, emsize)

>>>  
 500/ 2986 batches | accuracy    0.365
 1000/ 2986 batches | accuracy    0.667
 1500/ 2986 batches | accuracy    0.753
 2000/ 2986 batches | accuracy    0.787
 2500/ 2986 batches | accuracy    0.805
-----------------------------------------------------------
| end of epoch   1 | time: 10.46s | test accuracy    0.828 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.835
 1000/ 2986 batches | accuracy    0.840
 1500/ 2986 batches | accuracy    0.843
 2000/ 2986 batches | accuracy    0.841
 2500/ 2986 batches | accuracy    0.845
-----------------------------------------------------------
| end of epoch   2 | time: 10.23s | test accuracy    0.850 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.858
 1000/ 2986 batches | accuracy    0.861
 1500/ 2986 batches | accuracy    0.860
 2000/ 2986 batches | accuracy    0.859
 2500/ 2986 batches | accuracy    0.864
-----------------------------------------------------------
| end of epoch   3 | time: 10.79s | test accuracy    0.866 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.867
 1000/ 2986 batches | accuracy    0.871
 1500/ 2986 batches | accuracy    0.874
 2000/ 2986 batches | accuracy    0.868
 2500/ 2986 batches | accuracy    0.873
-----------------------------------------------------------
| end of epoch   4 | time:  9.94s | test accuracy    0.873 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.879
 1000/ 2986 batches | accuracy    0.878
 1500/ 2986 batches | accuracy    0.879
 2000/ 2986 batches | accuracy    0.878
 2500/ 2986 batches | accuracy    0.878
-----------------------------------------------------------
| end of epoch   5 | time: 10.19s | test accuracy    0.876 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.886
 1000/ 2986 batches | accuracy    0.889
 1500/ 2986 batches | accuracy    0.883
 2000/ 2986 batches | accuracy    0.885
 2500/ 2986 batches | accuracy    0.885
-----------------------------------------------------------
| end of epoch   6 | time:  9.84s | test accuracy    0.879 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.892
 1000/ 2986 batches | accuracy    0.887
 1500/ 2986 batches | accuracy    0.889
 2000/ 2986 batches | accuracy    0.892
 2500/ 2986 batches | accuracy    0.889
-----------------------------------------------------------
| end of epoch   7 | time: 10.10s | test accuracy    0.883 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.893
 1000/ 2986 batches | accuracy    0.893
 1500/ 2986 batches | accuracy    0.895
 2000/ 2986 batches | accuracy    0.894
 2500/ 2986 batches | accuracy    0.898
-----------------------------------------------------------
| end of epoch   8 | time:  9.60s | test accuracy    0.885 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.897
 1000/ 2986 batches | accuracy    0.901
 1500/ 2986 batches | accuracy    0.898
 2000/ 2986 batches | accuracy    0.899
 2500/ 2986 batches | accuracy    0.901
-----------------------------------------------------------
| end of epoch   9 | time: 10.58s | test accuracy    0.887 
-----------------------------------------------------------
  500/ 2986 batches | accuracy    0.901
 1000/ 2986 batches | accuracy    0.905
 1500/ 2986 batches | accuracy    0.904
 2000/ 2986 batches | accuracy    0.903
 2500/ 2986 batches | accuracy    0.900
-----------------------------------------------------------
| end of epoch  10 | time: 10.92s | test accuracy    0.889 
-----------------------------------------------------------
CPU times: user 27min 8s, sys: 3.65 s, total: 27min 12s
Wall time: 1min 42s

test 정확도는 88.9% 가 나왔습니다.

predict

train한 model을 이용해 결과를 확인할 수 있는 predict 함수입니다.

def predict(text, text_pipeline):
    """Predict the class of one raw text with the module-level ``model``.

    Args:
        text: Raw input string.
        text_pipeline: Callable mapping the string to a list of token indices.

    Returns:
        The predicted label as an ``int``. One is added to the arg-max index
        to undo the ``- 1`` shift applied by ``label_pipeline``.
    """
    # Create the inputs on the model's own device: CPU tensors passed to a
    # model that lives on CUDA raise a device-mismatch error.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        # dtype is pinned so an empty index list does not become a float tensor.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64,
                                 device=model_device)
        # A single sample: its tokens start at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1

predict(text='생각보다 재미없음', text_pipeline=text_pipeline)
>>> 2

train에 사용되지 않은 valid dataset을 이용해 정확도를 확인해봅니다.

def evaluate_main(valid_iter):
    """Print and return the accuracy of the module-level ``model`` on a validation set.

    Args:
        valid_iter: Iterable of samples, converted with ``to_map_style_dataset``.

    Returns:
        The accuracy value returned by ``evaluate``.
    """
    batch_size = 64

    valid_dataset = to_map_style_dataset(valid_iter)
    # Accuracy does not depend on sample order, so the batches are not shuffled.
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,
                                  shuffle=False, collate_fn=collate_batch)
    criterion = torch.nn.CrossEntropyLoss()

    print('Checking the results of valid dataset.')
    accu_test = evaluate(valid_dataloader, criterion)
    print('valid accuracy {:8.3f}'.format(accu_test))
    return accu_test

# Fresh iterator over the held-out validation split; evaluate_main scores the
# current `model` on it.
valid_iter = iter(CustomDataset(x_data = x_valid.tolist(), y_data = y_valid.tolist()))
accu_test = evaluate_main(valid_iter)

>>> Checking the results of valid dataset.
valid accuracy    0.887

validation dataset으로 예측한 값의 정확도는 88.7%로 test 점수와 크게 다르지 않은 결과를 확인할 수 있습니다.

'개발' 카테고리의 다른 글

[PyTorch] DataLoader를 사용하기 위한 Dataset 생성하기 (0)	2023.08.08
[PyTorch] 이미지 Classification - CNN (1) CIFAR10 dataset (0)	2023.02.20
[Snowflake] data insert 하는 방법 (1) snowlight worksheet SQL (0)	2023.01.06
[AutoML] auto-sklearn classification example (0)	2022.06.26

공지사항

최근에 올라온 글

최근에 달린 댓글

Total

Today

Yesterday

링크

TAG more

« 2025/07 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

글 보관함