
33_BERT Document Classification

# Document classification involves almost no decoding (generation), so encoder-only BERT tends to handle it better than decoder-based GPT

 

Model structure

  • Tokenize the input sentence, then attach the special tokens CLS and SEP, which mark the start and end of the sentence, to the front and back of the original token sequence
  • Feed this into the BERT model to produce a sentence-level vector (pooler_output), attach a small extra module on top of that vector, and train the whole model so its output takes the form [probability the sentence is positive, probability the sentence is negative] (see the sketch below)
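A quick way to see the special tokens in action (a minimal, standalone sketch; the example sentence is made up and assumes the beomi/kcbert-base checkpoint used below is downloadable):

from transformers import BertTokenizer

# Encode one sentence and inspect the resulting tokens
tok = BertTokenizer.from_pretrained("beomi/kcbert-base")
ids = tok("재미있는 영화였다")["input_ids"]
print(tok.convert_ids_to_tokens(ids))
# -> ['[CLS]', ..., '[SEP]']  (CLS prepended, SEP appended)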
import os
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers.optimization import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score

 

train_df = pd.read_csv("./data/nsmc/ratings_train.txt", sep = "\t")
test_df = pd.read_csv("./data/nsmc/ratings_test.txt", sep = "\t")

 

train_df = train_df.dropna(subset = ["document"])
test_df = test_df.dropna(subset = ["document"])

 

train_df

 

# Load the kcbert-base tokenizer (Korean BERT; keep the original casing)
tokenizer = BertTokenizer.from_pretrained(
    "beomi/kcbert-base",
    do_lower_case = False
)

 

# Create the tokenized datasets
train_dataset = tokenizer(
    train_df["document"].values.tolist(),
    padding = "max_length",
    max_length = 128,
    truncation = True
)

test_dataset = tokenizer(
    test_df["document"].values.tolist(),
    padding = "max_length",
    max_length = 128,
    truncation = True
)

 

# Extract the labels
train_labels = train_df["label"].tolist()
test_labels = test_df["label"].tolist()

 

# Wrap the tokenized encodings and labels as a PyTorch Dataset
class NSMCDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

 

# Create PyTorch Dataset instances
train_dataset = NSMCDataset(train_dataset, train_labels)
test_dataset = NSMCDataset(test_dataset, test_labels)

 

train_dataset[0]
{'input_ids': tensor([    2,  2170,   832,  5045,    17,    17,  7992, 29734,  4040, 10720,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(0)}
  • The segment information (token_type_ids) is built so that tokens belonging to the first document get 0 and tokens belonging to the second document get 1
    • Because this task feeds in a single movie-review document and classifies it, every segment is 0 (see the sketch below)
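For reference, encoding a sentence pair is what makes the second segment's token_type_ids equal to 1 (a minimal sketch; the two example sentences are made up):

pair = tokenizer("첫 번째 문장입니다.", "두 번째 문장입니다.")
print(pair["token_type_ids"])
# -> [0, 0, ..., 0, 1, 1, ..., 1]  (0 for the first sentence's tokens, 1 for the second's)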

 

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
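A quick sanity check on what one batch looks like (sketch):

batch = next(iter(train_loader))
print(batch["input_ids"].shape)   # torch.Size([4, 128]) -> (batch_size, max_length)
print(batch["labels"].shape)      # torch.Size([4])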

 

# Load the pretrained model
pretrained_model_config = BertConfig.from_pretrained(
    "beomi/kcbert-base",
    num_labels = 2
)

model = BertForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    config = pretrained_model_config
)
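Setting num_labels = 2 attaches a randomly initialized two-way classification head on top of the pooled [CLS] output; you can confirm this by inspecting the model (sketch):

print(model.classifier)
# -> Linear(in_features=768, out_features=2, bias=True)  (kcbert-base hidden size is 768)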

 

# Define the training task
# Use a GPU if one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set up the optimizer and scheduler
# (transformers' own AdamW is deprecated, so torch.optim.AdamW is used instead)
# optimizer = AdamW(model.parameters(), lr=2e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
total_steps = len(train_loader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
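With num_warmup_steps=0 there is no warmup phase, so the learning rate simply decays linearly from 2e-5 down to 0 over total_steps (a sketch of the equivalent formula):

# lr(step) = 2e-5 * max(0.0, (total_steps - step) / total_steps)
for step in (0, total_steps // 2, total_steps):
    print(step, 2e-5 * max(0.0, (total_steps - step) / total_steps))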

# Loss function
# Note: BertForSequenceClassification already computes cross-entropy internally
# when `labels` are passed (used as outputs.loss below), so this explicit
# loss_fn is never actually called in the training loop.
loss_fn = torch.nn.CrossEntropyLoss()

 

 

def save_checkpoint(state, filename='checkpoint.pth.tar'):
    """
    모델 체크포인트를 저장하는 함수.
    
    Args:
        state (dict): 저장할 상태 딕셔너리 (모델 상태, 옵티마이저 상태 등).
        filename (str): 저장할 파일 이름.
    """
    torch.save(state, filename)

def load_checkpoint(filename, model, optimizer=None):
    """
    모델 체크포인트를 불러오는 함수.
    
    Args:
        filename (str): 불러올 체크포인트 파일 이름.
        model (torch.nn.Module): 불러올 모델.
        optimizer (torch.optim.Optimizer, optional): 불러올 옵티마이저.
    
    Returns:
        int: 불러온 에포크 번호.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"No checkpoint found at '{filename}'")
    
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    if optimizer and 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    epoch = checkpoint.get('epoch', 0)
    return epoch
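Hypothetical resume sketch (the checkpoint path is an assumption; it only exists once the training loop below has written it):

resume_path = './checkpoints/checkpoint_epoch_1.pth.tar'  # hypothetical file from a previous run
if os.path.isfile(resume_path):
    start_epoch = load_checkpoint(resume_path, model, optimizer)
    print(f"Resuming from epoch {start_epoch}")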

 

# Set up the checkpoint directory
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

best_f1 = 0.0  # track the best F1 score seen so far

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    loop = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for batch in loop:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_train_loss += loss.item()
        
        loop.set_postfix(loss=loss.item())
    
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Train Loss: {avg_train_loss:.4f}")
    
    # Evaluation
    model.eval()
    total_eval_loss = 0
    preds = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            total_eval_loss += loss.item()
            
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    avg_eval_loss = total_eval_loss / len(test_loader)
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)
    
    print(f"Epoch {epoch + 1} Evaluation Loss: {avg_eval_loss:.4f}, Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    
    # Save a checkpoint for this epoch
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch + 1}.pth.tar')
    save_checkpoint({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': avg_train_loss,
    }, filename=checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")
    
    # Save the model whenever it achieves a new best F1 score
    if f1 > best_f1:
        best_f1 = f1
        best_model_path = os.path.join(checkpoint_dir, 'best_model.pth.tar')
        save_checkpoint({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_train_loss,
        }, filename=best_model_path)
        print(f"Best model updated and saved at {best_model_path}")
Epoch 1/3:   8%|███▋                                             | 2848/37499 [1:27:54<17:50:22,  1.85s/it, loss=0.307]

 

 

# Final evaluation
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Final Evaluation', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

acc = accuracy_score(true_labels, preds)
f1 = f1_score(true_labels, preds)

print(f"Final Evaluation - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")